diff --git a/CMakeLists.txt b/CMakeLists.txt index 14fc4d680..6e1ad87ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,8 @@ if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_S "Enable -Werror flags to turn warnings into errors for supporting compilers.") set(OPENMP_LIBDIR_SUFFIX "" CACHE STRING "Suffix of lib installation directory, e.g. 64 => lib64") + # Do not use OPENMP_LIBDIR_SUFFIX directly, use OPENMP_INSTALL_LIBDIR. + set(OPENMP_INSTALL_LIBDIR "lib${OPENMP_LIBDIR_SUFFIX}") # Group test settings. set(OPENMP_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING @@ -28,7 +30,7 @@ if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_S else() set(OPENMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) # If building in tree, we honor the same install suffix LLVM uses. - set(OPENMP_LIBDIR_SUFFIX ${LLVM_LIBDIR_SUFFIX}) + set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") if (NOT MSVC) set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) diff --git a/README.rst b/README.rst index ea79f1948..9fb34dfc2 100644 --- a/README.rst +++ b/README.rst @@ -257,9 +257,11 @@ Options for ``libomptarget`` Options for ``NVPTX device RTL`` -------------------------------- -**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``OFF|ON`` +**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``ON|OFF`` Enable CUDA LLVM bitcode offloading device RTL. This is used for link time - optimization of the OMP runtime and application code. + optimization of the OMP runtime and application code. This option is enabled + by default if the build system determines that `CMAKE_C_COMPILER` is able to + compile and link the library. **LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""`` Location of a CUDA compiler capable of emitting LLVM bitcode. 
Currently only diff --git a/libompd/cuda_examples/test_target_generic.c b/libompd/cuda_examples/test_target_generic.c new file mode 100644 index 000000000..db881da5e --- /dev/null +++ b/libompd/cuda_examples/test_target_generic.c @@ -0,0 +1,33 @@ +// Testing generic mode of nvptx devRtl +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + test_breakpoint(); + #pragma omp parallel for + for (i=0; i +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + omp_set_nested(1); + #pragma omp target map(v1, v2, p) + { + omp_set_nested(1); + #pragma omp parallel shared(v1, v2, p, N) num_threads(4) + { + printf("Outer region - thread ID: %d\n", omp_get_thread_num()); + #pragma omp for + for (int i = 0; i < N; ++i) + { + float acc = 0; + #pragma omp parallel shared(v1, v2, p, N) num_threads(4) + #pragma omp for + for(int j = 0; j < N; ++j) + { + test_breakpoint(); + p[i] += v1[i] + v2[i]; + } + } + } + printf("End of target region\n"); + } +//output(p, N); +} +int main() { + printf("calling vec_mul...\n"); + vec_mult(64); + printf("done\n"); + return 0; +} diff --git a/libompd/cuda_examples/test_target_noparallel.c b/libompd/cuda_examples/test_target_noparallel.c new file mode 100644 index 000000000..2e2f2f51c --- /dev/null +++ b/libompd/cuda_examples/test_target_noparallel.c @@ -0,0 +1,24 @@ +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + #pragma omp target map(v1, v2, p) + { + test_breakpoint(); + p[0] = v[0] * v[0]; + } +} +int main() { + printf("calling vec_mul...\n"); + vec_mult(64); + printf("done\n"); + return 0; +} diff --git 
a/libompd/cuda_examples/test_target_single.c b/libompd/cuda_examples/test_target_single.c new file mode 100644 index 000000000..4a2bc3260 --- /dev/null +++ b/libompd/cuda_examples/test_target_single.c @@ -0,0 +1,29 @@ +#include + +#pragma omp declare target +float mult(float u, float v) { + return u * v; +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + #pragma omp parallel for + for (i=0; i + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + #pragma omp parallel for + for (i=0; i +#include + +#pragma omp declare target +void task1() { + printf("Hello from Task 1\n"); + uint32_t enter_frame = 0; + for(;1;) { + } +} +void task2() { + printf("Hello from Task 2\n"); + for(;1;) { + } +} +#pragma omp end declare target + +int main() { + #pragma omp target + { + #pragma omp parallel num_threads(4) + { + #pragma omp single + { + #pragma omp task + task1(); + #pragma omp task + task2(); + } + } + } + return 0; +} diff --git a/libompd/gdb-wrapper/CMakeLists.txt b/libompd/gdb-wrapper/CMakeLists.txt index c3ea2824c..ec87ef31b 100644 --- a/libompd/gdb-wrapper/CMakeLists.txt +++ b/libompd/gdb-wrapper/CMakeLists.txt @@ -1,5 +1,9 @@ project (odb) +cmake_minimum_required(VERSION 2.8) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") + set (cppfiles InputOutputManager.cpp ChildProcess.cpp @@ -30,14 +34,12 @@ add_executable (odb-bin ${cppfiles} odb.cpp) set_target_properties (odb-bin PROPERTIES OUTPUT_NAME odb) add_library (odb ${cppfiles}) -if (ODB_LINUX) -target_link_libraries (odb-bin dl) -target_link_libraries (odb dl) -endif (ODB_LINUX) +target_link_libraries (odb-bin dl) +target_link_libraries (odb dl) include_directories ( ${CMAKE_CURRENT_SOURCE_DIR} -# 
${CMAKE_CURRENT_SOURCE_DIR}/../src/ + ${CMAKE_CURRENT_SOURCE_DIR}/../src/ ${CMAKE_BINARY_DIR}/include ) diff --git a/libompd/gdb-wrapper/Callbacks.cpp b/libompd/gdb-wrapper/Callbacks.cpp index e15e7e795..77c91ec1b 100644 --- a/libompd/gdb-wrapper/Callbacks.cpp +++ b/libompd/gdb-wrapper/Callbacks.cpp @@ -37,17 +37,16 @@ void initializeCallbacks(const GdbProcessPtr &proc) gdb = proc; // Initialize static table - cb.dmemory_alloc = CB_dmemory_alloc; - cb.dmemory_free = CB_dmemory_free; - cb.print_string = CB_print_string; - cb.get_thread_context_for_osthread = CB_thread_context; - cb.get_containing_process_context = CB_process_context; - cb.tsizeof_prim = CB_tsizeof_prim; - cb.tsymbol_addr = CB_tsymbol_addr; - cb.read_tmemory = CB_read_tmemory; - cb.write_tmemory = CB_write_tmemory; - cb.host_to_target = CB_host_to_target; - cb.target_to_host = CB_target_to_host; + cb.memory_alloc = CB_dmemory_alloc; + cb.memory_free = CB_dmemory_free; + cb.print_string = CB_print_string; + cb.get_thread_context_for_thread_id = CB_thread_context; + cb.sizeof_types = CB_tsizeof_prim; + cb.symbol_addr_lookup = CB_tsymbol_addr; + cb.read_memory = CB_read_tmemory; + cb.write_memory = CB_write_tmemory; + cb.host_to_device = CB_host_to_target; + cb.device_to_host = CB_target_to_host; } ompd_callbacks_t * getCallbacksTable() @@ -78,14 +77,14 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( ompd_address_space_context_t *context, - ompd_osthread_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext ) { ompd_rc_t ret = context ? 
ompd_rc_ok : ompd_rc_stale_handle; - if (kind == ompd_osthread_cudalogical) { + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { *tcontext = ((OMPDContext*)context)->getContextForThread((CudaThread*)osthread); } else { @@ -126,7 +125,7 @@ void init_sizes(){ ompd_rc_t CB_tsizeof_prim( ompd_address_space_context_t *context, - ompd_target_type_sizes_t *sizes) + ompd_device_type_sizes_t *sizes) { ompd_rc_t ret = context ? ompd_rc_ok : ompd_rc_stale_handle; static int inited = 0; @@ -135,7 +134,12 @@ ompd_rc_t CB_tsizeof_prim( inited=1; init_sizes(); } - memcpy(sizes, prim_sizes, sizeof(prim_sizes[0])*ompd_type_max); + sizes->sizeof_char = prim_sizes[ompd_type_char]; + sizes->sizeof_short = prim_sizes[ompd_type_short]; + sizes->sizeof_int = prim_sizes[ompd_type_int]; + sizes->sizeof_long = prim_sizes[ompd_type_long]; + sizes->sizeof_long_long = prim_sizes[ompd_type_long_long]; + sizes->sizeof_pointer = prim_sizes[ompd_type_pointer]; return ret; } @@ -175,7 +179,7 @@ ompd_rc_t CB_tsymbol_addr( parser.matchAddressValue(gdb->readOutput().c_str(), addr); if (strlen(addr) > 0) - symbol_addr->address = (ompd_taddr_t) strtoull (addr, NULL, 0); + symbol_addr->address = (ompd_addr_t) strtoull (addr, NULL, 0); else if (strlen(addr) == 0) ret = ompd_rc_error; @@ -267,7 +271,7 @@ ompd_rc_t CB_write_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, const void *buffer) { return ompd_rc_unsupported; @@ -277,7 +281,7 @@ ompd_rc_t CB_read_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, void *buffer) { if (!context) diff --git a/libompd/gdb-wrapper/Callbacks.h b/libompd/gdb-wrapper/Callbacks.h index d93c74580..3e8f379be 100644 --- a/libompd/gdb-wrapper/Callbacks.h +++ b/libompd/gdb-wrapper/Callbacks.h @@ -48,7 +48,7 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( 
ompd_address_space_context_t *context, - ompd_osthread_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext); @@ -59,7 +59,7 @@ ompd_rc_t CB_process_context ( ompd_rc_t CB_tsizeof_prim ( ompd_address_space_context_t *context, - ompd_target_type_sizes_t *sizes); + ompd_device_type_sizes_t *sizes); ompd_rc_t CB_tsymbol_addr ( ompd_address_space_context_t *context, @@ -71,7 +71,7 @@ ompd_rc_t CB_read_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, const ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, void *buffer ); @@ -79,7 +79,7 @@ ompd_rc_t CB_write_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, const ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, const void *buffer ); diff --git a/libompd/gdb-wrapper/CudaGdb.h b/libompd/gdb-wrapper/CudaGdb.h index b690257b6..0408668b0 100644 --- a/libompd/gdb-wrapper/CudaGdb.h +++ b/libompd/gdb-wrapper/CudaGdb.h @@ -13,6 +13,7 @@ #include #include #include "ompd.h" +#include "../src/ompd-private.h" struct CudaThread { ompd_cudathread_coord_t coord; diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index b1a82e67f..b34b72396 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -5,14 +5,16 @@ * Author: Ignacio Laguna * Contact: ilaguna@llnl.gov */ -#include +//#include #include "OMPDCommand.h" #include "OMPDContext.h" #include "Callbacks.h" #include "OutputString.h" #include "Debug.h" +#include "omp.h" #include "ompd.h" -#include "ompd_test.h" +//#include "ompd_test.h" +#define ODB_LINUX #include "CudaGdb.h" #include @@ -23,24 +25,60 @@ using namespace ompd_gdb; using namespace std; -const char * ompd_state_names[256]; extern OMPDHostContextPool * host_contextPool; +/* --- OMPDIcvs ------------------------------------------------------------- */ + +OMPDIcvs::OMPDIcvs(OMPDFunctionsPtr 
functions, + ompd_address_space_handle_t *addrhandle) + : functions(functions) { + ompd_icv_id_t next_icv_id = ompd_icv_undefined; + int more = 1; + const char *next_icv_name_str; + ompd_scope_t next_scope; + ompd_rc_t ret = ompd_rc_ok; + while (more && ret == ompd_rc_ok) { + ret = functions->ompd_enumerate_icvs(addrhandle, + next_icv_id, + &next_icv_id, + &next_icv_name_str, + &next_scope, + &more); + if (ret == ompd_rc_ok) { + availableIcvs[next_icv_name_str] = + std::pair(next_icv_id, next_scope); + } + } +} + + +ompd_rc_t OMPDIcvs::get(ompd_parallel_handle_t *handle, const char *name, + ompd_word_t *value) { + ompd_icv_id_t icv; + ompd_scope_t scope; + + auto &p = availableIcvs.at(name); + icv = p.first; + scope = p.second; + + if (scope != ompd_scope_parallel) { + return ompd_rc_bad_input; + } + + return functions->ompd_get_icv_from_scope((void *)handle, scope, icv, value); +} + /* --- OMPDCommandFactory --------------------------------------------------- */ OMPDCommandFactory::OMPDCommandFactory() { functions = OMPDFunctionsPtr(new OMPDFunctions); -#define ompd_state_macro(state, code) ompd_state_names[code] = #state; - FOREACH_OMPD_STATE(ompd_state_macro) -#undef ompd_state_macro - // Load OMPD DLL and get a handle #ifdef ODB_LINUX - functions->ompdLibHandle = dlopen("libompd_intel.so", RTLD_LAZY); + functions->ompdLibHandle = dlopen("libompd.so", RTLD_LAZY); #elif defined(ODB_MACOS) - functions->ompdLibHandle = dlopen("libompd_intel.dylib", RTLD_LAZY); + functions->ompdLibHandle = dlopen("libompd.dylib", RTLD_LAZY); #else #error Unsupported platform! 
#endif @@ -67,16 +105,24 @@ OMPDCommandFactory::OMPDCommandFactory() FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) #undef OMPD_FIND_API_FUNCTION +} - //functions->test_CB_tsizeof_prim = - // (void (*)()) findFunctionInLibrary("test_CB_tsizeof_prim"); - //functions->test_CB_dmemory_alloc = - // (void (*)()) findFunctionInLibrary("test_CB_dmemory_alloc"); +OMPDCommandFactory::~OMPDCommandFactory() +{ + ompd_rc_t ret; + ret = functions->ompd_release_address_space_handle(addrhandle); + if (ret != ompd_rc_ok) + { + out << "ERROR: could not finalize target address space\n"; + } +} +void OMPDCommandFactory::initOmpd() +{ // Initialize OMPD library ompd_callbacks_t *table = getCallbacksTable(); assert(table && "Invalid callbacks table"); - ompd_rc_t ret = functions->ompd_initialize(table); + ompd_rc_t ret = functions->ompd_initialize(0, table); if (ret != ompd_rc_ok) { out << "ERROR: could not initialize OMPD\n"; @@ -86,22 +132,12 @@ FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) /*&prochandle, */&addrhandle); if (ret != ompd_rc_ok) { + addrhandle = nullptr; out << "ERROR: could not initialize target process\n"; } -} - -OMPDCommandFactory::~OMPDCommandFactory() -{ - ompd_rc_t ret; -// ret = functions->ompd_process_finalize(prochandle); -// if (ret != ompd_rc_ok) -// { -// out << "ERROR: could not finalize target process\n"; -// } - ret = functions->ompd_release_address_space_handle(addrhandle); - if (ret != ompd_rc_ok) + else { - out << "ERROR: could not finalize target address space\n"; + icvs = OMPDIcvsPtr(new OMPDIcvs(functions, addrhandle)); } } @@ -122,21 +158,28 @@ void * OMPDCommandFactory::findFunctionInLibrary(const char *fun) const return ret; } -OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& extraArgs) const +OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& extraArgs) { + if (addrhandle == nullptr) { + initOmpd(); + } + if (strcmp(str, "test") == 0) return new OMPDTestCallbacks(functions, addrhandle, extraArgs); else if 
(strcmp(str, "threads") == 0) return new OMPDThreads(functions, addrhandle, extraArgs); else if (strcmp(str, "levels") == 0) - return new OMPDLevels(functions, addrhandle, extraArgs); + return new OMPDLevels(functions, addrhandle, icvs, extraArgs); else if (strcmp(str, "callback") == 0) return new OMPDCallback(functions, addrhandle, extraArgs); else if (strcmp(str, "api") == 0) return new OMPDApi(functions, addrhandle, extraArgs); else if (strcmp(str, "testapi") == 0) - return new OMPDTest(functions, addrhandle, extraArgs); - + return new OMPDTest(functions, addrhandle, icvs, extraArgs); + else if (strcmp(str, "parallel") == 0) + return new OMPDParallelRegions(functions, addrhandle, icvs, extraArgs); + else if (strcmp(str, "tasks") == 0) + return new OMPDTasks(functions, addrhandle, icvs, extraArgs); return new OMPDNull; } @@ -183,6 +226,17 @@ const char* OMPDTestCallbacks::toString() const void OMPDThreads::execute() const { + // get state names + map host_state_names; + ompd_word_t more_states = 1; + ompd_word_t next_state = omp_state_undefined; + host_state_names[next_state] = "ompd_state_undefined"; + while (more_states) { + const char *state_name; + functions->ompd_enumerate_states(addrhandle, next_state, &next_state, &state_name, &more_states); + host_state_names[next_state] = state_name; + } + printf("\nHOST THREADS\n"); printf("Debugger_handle Thread_handle System_thread\n"); printf("--------------------------------------------------\n"); @@ -191,15 +245,16 @@ void OMPDThreads::execute() const for(auto i: thread_ids) { ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( - addrhandle, ompd_osthread_pthread, sizeof(i.second), + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second), &(i.second), &thread_handle); if (ret == ompd_rc_ok) { - ompd_state_t state; + ompd_word_t state; ompd_wait_id_t wait_id; ret = functions->ompd_get_state(thread_handle, &state, &wait_id); - printf(" %-12u %p 0x%lx\t%i\t%lx\n", - (unsigned 
int)i.first, thread_handle, i.second, state, wait_id); + printf(" %-12u %p 0x%lx\t%s\t%lx\n", + (unsigned int)i.first, thread_handle, i.second, host_state_names[state], wait_id); + functions->ompd_release_thread_handle(thread_handle); } else { @@ -211,7 +266,20 @@ void OMPDThreads::execute() const int omp_cuda_threads = 0; vector cuda_ContextPools; map device_initialized; - map address_spaces; + map address_spaces; + ompd_word_t last_state = -1; + ompd_cudathread_coord_t last_coords; + vector device_thread_handles; + + // get cuda states + map cuda_state_names; + more_states = 1; + next_state = omp_state_undefined; + cuda_state_names[next_state] = "omp_state_undefined"; + + printf("\nCUDA THREADS\n"); + printf("Cuda block from Thread to Thread state\n"); + printf("------------------------------------------\n"); for(auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { @@ -221,26 +289,76 @@ void OMPDThreads::execute() const device_initialized[i.coord.cudaContext] = true; result = functions->ompd_device_initialize( - cpool->getGlobalOmpdContext(), - i.coord.cudaContext, - ompd_device_kind_cuda, + addrhandle, + cpool->getGlobalOmpdContext(), + OMPD_DEVICE_KIND_CUDA, + sizeof(i.coord.cudaContext), + &i.coord.cudaContext, &cpool->ompd_device_handle); if (result != ompd_rc_ok) + { continue; + } address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; + while (more_states) { + const char *state_name; + functions->ompd_enumerate_states(cpool->ompd_device_handle, + next_state, &next_state, + &state_name, &more_states); + cuda_state_names[next_state] = state_name; + } } ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( address_spaces[i.coord.cudaContext], - ompd_osthread_cudalogical, + OMPD_THREAD_ID_CUDALOGICAL, sizeof(i.coord), &i.coord, &thread_handle); if (ret == ompd_rc_ok) + { + ompd_word_t state; + device_thread_handles.push_back(thread_handle); + ret = functions->ompd_get_state(thread_handle, &state, 
NULL); + if (last_state == -1) { + last_state = state; + last_coords = i.coord; + printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); + } else if (state != last_state || i.coord.blockIdx.x != last_coords.blockIdx.x || i.coord.threadIdx.x != last_coords.threadIdx.x + 1) { + printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); + last_coords = i.coord; + last_state = state; + printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); + } else { /* state == last_state*/ + last_coords = i.coord; + } omp_cuda_threads++; + } + } + // Check for non-unique handles + for (auto i: device_thread_handles) { + for (auto j: device_thread_handles) { + int value; + if (i == j) { + continue; + } + ompd_rc_t ret = functions->ompd_thread_handle_compare(i, j, &value); + if (!value) { + printf("FOUND NON-UNIQUE THREAD HANDLES FOR DIFFERENT THREADS\n"); + } + } + } + + // release thread handles + for (auto i: device_thread_handles) { + functions->ompd_release_thread_handle(i); + } + + if (last_state != -1) { + printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); } if (cuda.threads.size() != 0) { @@ -259,30 +377,28 @@ const char* OMPDThreads::toString() const void OMPDLevels::execute() const { -/* ompd_size_t num_os_threads; - ompd_rc_t ret = CB_num_os_threads(contextPool->getGlobalOmpdContext(), &num_os_threads); - assert(ret==ompd_rc_ok && "Error calling OMPD!"); - ompd_osthread_t* osThreads = (ompd_osthread_t*) - malloc(sizeof(ompd_osthread_t)*num_os_threads); - ret = CB_get_os_threads (contextPool->getGlobalOmpdContext(), &num_os_threads, &osThreads); - assert(ret==ompd_rc_ok && "Error calling OMPD!"); - + ompd_rc_t ret; printf("\n"); printf("Thread_handle Nesting_level\n"); printf("-------------------------------\n"); - for (size_t i=0; i < num_os_threads; ++i) + for (auto i: getThreadIDsFromDebugger()) { - ompd_thread_handle_t thread_handle; + ompd_thread_handle_t *thread_handle; + 
ompd_parallel_handle_t *parallel_handle; ret = functions->ompd_get_thread_handle( - contextPool->getGlobalOmpdContext(), &(osThreads[i]), &thread_handle); + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second) ,&(i.second), &thread_handle); + if (ret != ompd_rc_ok) { + continue; + } + ret = functions->ompd_get_current_parallel_handle(thread_handle, + ¶llel_handle); if (ret == ompd_rc_ok) { - ompd_tword_t level=0; - ret = functions->ompd_nesting_level( - contextPool->getGlobalOmpdContext(), &thread_handle, &level); - printf("%-12u %ld\n", (unsigned int)thread_handle, level); + ompd_word_t level=0; + icvs->get(parallel_handle, "levels-var", &level); + printf("%-12p %ld\n", thread_handle, level); } - }*/ + } } const char* OMPDLevels::toString() const @@ -293,23 +409,9 @@ const char* OMPDLevels::toString() const /* --- OMPDCallback ----------------------------------------------------------- */ -ompd_target_prim_types_t get_prim_type_from_string(const string& str) -{ - const char * names[ompd_type_max] = { - "CHAR", - "SHORT", - "INT", - "LONG", - "LONG_LONG", - "POINTER" - }; - for (int i = 0; 0\" to get more help on the usage" << endl; return; - } + } /*ompd_rc_t CB_read_tmemory ( ompd_context_t *context, - ompd_taddr_t addr, + ompd_addr_t addr, ompd_tword_t bufsize, void *buffer );*/ @@ -333,7 +435,7 @@ void OMPDCallback::execute() const return; } long long temp=0; - ompd_taddr_t addr = (ompd_taddr_t)strtoll(extraArgs[1].c_str(), NULL, 0); + ompd_addr_t addr = (ompd_addr_t)strtoll(extraArgs[1].c_str(), NULL, 0); int cnt = atoi(extraArgs[2].c_str()); ret = CB_read_tmemory( host_contextPool->getGlobalOmpdContext(), NULL, {0,addr}, cnt, &temp); @@ -345,7 +447,7 @@ void OMPDCallback::execute() const /*ompd_rc_t CB_tsymbol_addr ( ompd_context_t *context, const char *symbol_name, - ompd_taddr_t *symbol_addr);*/ + ompd_addr_t *symbol_addr);*/ if (extraArgs[0] == "tsymbol_addr") { @@ -370,7 +472,7 @@ const char* OMPDCallback ::toString() const } void OMPDApi::execute() const 
-{ +{ ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") @@ -378,7 +480,7 @@ void OMPDApi::execute() const hout << "API functions available: read_tmemory, ttype, ttype_sizeof, ttype_offset, tsymbol_addr" << endl << "Use \"odb api \" to get more help on the usage" << endl; return; - } + } //ompd_rc_t ompd_get_threads ( // ompd_context_t *context, /* IN: debugger handle for the target */ @@ -388,6 +490,7 @@ void OMPDApi::execute() const if (extraArgs[0] == "get_threads") { +#if 0 if(extraArgs.size()>1) { hout << "Usage: odb api get_threads" << endl; @@ -395,8 +498,8 @@ void OMPDApi::execute() const } ompd_thread_handle_t ** thread_handle_array; int num_handles; - - + + ret = functions->ompd_get_threads ( addrhandle, &thread_handle_array, &num_handles); if (ret != ompd_rc_ok) @@ -404,8 +507,10 @@ void OMPDApi::execute() const sout << num_handles << " OpenMP threads:" << endl; for (int i=0; i odbGetThreadHandles(ompd_address_space_handle_t* a { ompd_thread_handle_t* thread_handle; ret = functions->ompd_get_thread_handle( - addrhandle, ompd_osthread_pthread, sizeof(i.second) ,&(i.second), &thread_handle); + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second) ,&(i.second), &thread_handle); if (ret!=ompd_rc_ok) continue; thread_handles.push_back(thread_handle); @@ -432,24 +537,74 @@ vector odbGetThreadHandles(ompd_address_space_handle_t* a return thread_handles; } +map odbInitCudaDevices(OMPDFunctionsPtr functions, CudaGdb &cuda, + ompd_address_space_handle_t *addrhandle) +{ + map ret; + map device_initialized; + for (auto i: cuda.threads) { + if (!device_initialized[i.coord.cudaContext]) { + ret.emplace(i.coord.cudaContext, &i); + device_initialized[i.coord.cudaContext] = true; + functions->ompd_device_initialize( + addrhandle, + ret.at(i.coord.cudaContext).getGlobalOmpdContext(), + OMPD_DEVICE_KIND_CUDA, + sizeof(i.coord.cudaContext), + &i.coord.cudaContext, + &ret.at(i.coord.cudaContext).ompd_device_handle); + } + } + return ret; +} + +vector 
odbGetCudaThreadHandles( + OMPDFunctionsPtr functions, + CudaGdb &cuda, + map &device_handles) +{ + ompd_rc_t ret; + + vector device_thread_handles; + + for(auto i: cuda.threads) { + ompd_thread_handle_t* thread_handle; + ompd_rc_t ret = functions->ompd_get_thread_handle( + device_handles.at(i.coord.cudaContext).ompd_device_handle, + OMPD_THREAD_ID_CUDALOGICAL, + sizeof(i.coord), &i.coord, + &thread_handle); + + if (ret == ompd_rc_ok) + { + device_thread_handles.push_back(thread_handle); + } + } + + return device_thread_handles; +} + vector odbGetParallelRegions(OMPDFunctionsPtr functions, ompd_thread_handle_t* &th) { ompd_rc_t ret; ompd_parallel_handle_t * parallel_handle; vector parallel_handles; - ret = functions->ompd_get_top_parallel_region( - th, ¶llel_handle); + ret = functions->ompd_get_current_parallel_handle( + th, ¶llel_handle); while(ret == ompd_rc_ok) { parallel_handles.push_back(parallel_handle); ret = functions->ompd_get_enclosing_parallel_handle( - parallel_handle, ¶llel_handle); + parallel_handle, ¶llel_handle); } return parallel_handles; } bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { + sout << "Checking of parallel IDs has been disabled for upgrade of ompd in branch ompd-devices\n"; + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -466,10 +621,14 @@ bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { + sout << "Checking of parallel IDs has been disable for upgrade of ompd in branch ompd-devices\n"; + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -486,10 +645,14 @@ bool odbCheckParallelNumThreads(OMPDFunctionsPtr functions, vector ths) { + sout << "Checking of task IDs has been disable for upgrade of ompd in branch ompd-devices\n"; + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -506,20 +669,21 @@ bool odbCheckTaskIDs(OMPDFunctionsPtr functions, vector ths if (ompt_res != ompd_res) res=false; } return res; +#endif } vector odbGetTaskRegions(OMPDFunctionsPtr 
functions, ompd_thread_handle_t* th) { ompd_rc_t ret; - ompd_task_handle_t * task_handle; + ompd_task_handle_t *task_handle; vector task_handles; - ret = functions->ompd_get_top_task_region( - th, &task_handle); + ret = functions->ompd_get_current_task_handle( + th, &task_handle); while(ret == ompd_rc_ok) { task_handles.push_back(task_handle); - ret = functions->ompd_get_ancestor_task_region( - task_handle, &task_handle); + ret = functions->ompd_get_generating_task_handle( + task_handle, &task_handle); // Is it generating or scheduling task or something different? } return task_handles; } @@ -527,28 +691,76 @@ vector odbGetTaskRegions(OMPDFunctionsPtr functions, ompd_t vector odbGetImplicitTasks(OMPDFunctionsPtr functions, ompd_parallel_handle_t* ph) { // ompd_rc_t ret; - ompd_task_handle_t** task_handles; - int num_tasks; + int num_tasks = evalGdbExpression("call omp_get_num_threads()"); vector return_handles; - /*ret = */functions->ompd_get_implicit_task_in_parallel( - ph, &task_handles, &num_tasks); - for(int i=0; iompd_get_task_in_parallel( + ph, i, &task_handle); + return_handles.push_back(task_handle); } - free(task_handles); return return_handles; } +static bool odbCheckThreadsInParallel(OMPDFunctionsPtr functions, + OMPDIcvsPtr icvs, + ompd_parallel_handle_t *ph, + vector thread_handles) { + ompd_rc_t ret; + bool check_passed = true; + int64_t icv_num_threads; + int64_t icv_level; + + icvs->get(ph, "levels-var", &icv_level); + + ret = icvs->get(ph, "ompd-team-size-var", &icv_num_threads); + if (ret != ompd_rc_ok) { + cout << "Error: could not retrieve icv 'ompd-team-size-var' (" << ret << ")" << endl; + return false; + } + + OMPDThreadHandleCmp thread_cmp_op(functions); + std::set unique_thread_handles(thread_handles.begin(), + thread_handles.end(), + thread_cmp_op); + + sout << "Checking parallel region with level " << icv_level << " and " + << icv_num_threads << " threads (overall " << unique_thread_handles.size() + << " associated threads)" << endl; + + 
ompd_thread_handle_t *th; + for(int i = 0; i < icv_num_threads; i++) { + ret = functions->ompd_get_thread_in_parallel(ph, i, &th); + if (ret != ompd_rc_ok) { + cout << "Could not retrieve thread handle " << i << " in parallel (" << ret << ")" << endl; + check_passed = false; + continue; + } + + auto matched_th = unique_thread_handles.find(th); + if (matched_th == unique_thread_handles.end()) { + cout << "Thread handle retrieved with ompd_get_thread_in_parallel doesn't match any thread associated with the parallel region (could already have been matched)" << endl; + check_passed = false; + } else { + sout << "Found matching thread for thread " << i << " in parallel region" << endl; + // we dont want a thread matched twice + unique_thread_handles.erase(matched_th); + } + functions->ompd_release_thread_handle(th); + } + return check_passed; +} + void OMPDTest::execute() const -{ +{ // ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") { hout << "Test suites available: threads, parallel, tasks" << endl; return; - } + } if (extraArgs[0] == "threads") { @@ -564,7 +776,7 @@ void OMPDTest::execute() const { auto parallel_h = odbGetParallelRegions(functions, thr_h); auto task_h = odbGetTaskRegions(functions, thr_h); - + sout << "Thread handle: 0x" << hex << thr_h << endl << "Parallel: "; for(auto ph: parallel_h) { @@ -573,10 +785,12 @@ void OMPDTest::execute() const auto implicit_task_h = odbGetImplicitTasks(functions, ph); for(auto ith: implicit_task_h) { +#if 0 //MARKER_MR: TODO: fix this uint64_t tid; functions->ompd_get_task_id( ith, &tid); - sout << "0x" << hex << ith << " (" << tid << "), "; +#endif + sout << "0x" << hex << ith << " (" << "DISABLED IN ompd-devices" << "), "; functions->ompd_release_task_handle(ith); } sout << endl; @@ -587,7 +801,7 @@ void OMPDTest::execute() const } sout << endl; pthread_t osthread; - functions->ompd_get_osthread(thr_h, ompd_osthread_pthread, sizeof(pthread_t), &osthread); + functions->ompd_get_thread_id(thr_h, 
OMPD_THREAD_ID_PTHREAD, sizeof(pthread_t), &osthread); host_contextPool->getThreadContext(&osthread)->setThisGdbContext(); odbCheckParallelIDs(functions, parallel_h); odbCheckTaskIDs(functions, task_h); @@ -598,11 +812,271 @@ void OMPDTest::execute() const functions->ompd_release_thread_handle(thr_h); } } + else if (extraArgs[0] == "parallel-threads") + { + // Checks if the thread handles returned by ompd_get_thread_in_parallel make sense + if (extraArgs.size() > 1) { + hout << "Usage: odb testapi parallel-threads" << endl; + return; + } + + // Check host parallel regions + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + + OMPDParallelHandleCmp parallel_cmp_op(functions); + std::map, + OMPDParallelHandleCmp> host_parallel_handles(parallel_cmp_op); + for (auto t: host_thread_handles) { + for (auto parallel_handle: odbGetParallelRegions(functions, t)) + { + host_parallel_handles[parallel_handle].push_back(t); + } + } + + bool host_check_passed = true; + for (auto &ph_threads: host_parallel_handles) { + if (!odbCheckThreadsInParallel(functions, icvs, ph_threads.first, ph_threads.second)) { + host_check_passed = false; + } + } + + cout << "Host check passed: " << host_check_passed << "\n" << endl; + + for (auto ph: host_parallel_handles) { + functions->ompd_release_parallel_handle(ph.first); + } + + for (auto th: host_thread_handles) { + functions->ompd_release_thread_handle(th); + } + // + // For Cuda devices + // + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDParallelHandleCmp> cuda_parallel_handles(parallel_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto p: odbGetParallelRegions(functions, t)) { + cuda_parallel_handles[p].push_back(t); + } + } + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + auto cudaIcvs = 
OMPDIcvsPtr(new OMPDIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle)); + + bool cuda_check_passed = true; + for (auto ph_threads: cuda_parallel_handles) { + if (!odbCheckThreadsInParallel(functions, cudaIcvs, ph_threads.first, ph_threads.second)) { + cuda_check_passed = false; + } + } + cout << "Cuda check passed: " << cuda_check_passed << endl; + return; + } } const char* OMPDTest::toString() const { return "odb api"; } + +void OMPDParallelRegions::execute() const +{ + ompd_rc_t ret; + + // + // For the host runtime + // + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + + OMPDParallelHandleCmp parallel_cmp_op(functions); + std::map, + OMPDParallelHandleCmp> host_parallel_handles(parallel_cmp_op); + for (auto t: host_thread_handles) { + for (auto parallel_handle: odbGetParallelRegions(functions, t)) + { + host_parallel_handles[parallel_handle].push_back(t); + } + } + + printf("HOST PARALLEL REGIONS\n"); + printf("Parallel Handle Num Threads ICV Num Threads ICV level ICV active level\n"); + printf("------------------------------------------------------------------------------\n"); + for (auto &p: host_parallel_handles) { + ompd_word_t icv_num_threads, icv_level, icv_active_level; + icvs->get(p.first, "ompd-team-size-var", &icv_num_threads); + icvs->get(p.first, "levels-var", &icv_level); + icvs->get(p.first, "active-levels-var", &icv_active_level); + printf("%-15p %-10zu %-15ld %-9ld %ld\n", p.first, p.second.size(), icv_num_threads, icv_level, icv_active_level); + } + + for (auto t: host_thread_handles) { + functions->ompd_release_thread_handle(t); + } + for (auto &p: host_parallel_handles) { + functions->ompd_release_parallel_handle(p.first); + } + + // + // For Cuda devices + // + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDParallelHandleCmp> 
cuda_parallel_handles(parallel_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto p: odbGetParallelRegions(functions, t)) { + cuda_parallel_handles[p].push_back(t); + } + } + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); + + printf("DEVICE PARALLEL REGIONS\n"); + printf("Parallel Handle Num Threads ICV Num Threads ICV level\n"); + printf("------------------------------------------------------------\n"); + for (auto &p: cuda_parallel_handles) { + ompd_word_t icv_level, icv_num_threads; + cudaIcvs.get(p.first, "ompd-team-size-var", &icv_num_threads); + cudaIcvs.get(p.first, "levels-var", &icv_level); + printf("%-15p %-10zu %-14ld %ld\n", p.first, p.second.size(), icv_num_threads, icv_level); + } + + for (auto t: cuda_thread_handles) { + functions->ompd_release_thread_handle(t); + } + for (auto &p: cuda_parallel_handles) { + functions->ompd_release_parallel_handle(p.first); + } + for (auto &d: cuda_device_handles) { + functions->ompd_release_address_space_handle(d.second.ompd_device_handle); + } +} + +const char *OMPDParallelRegions::toString() const +{ + return "odb parallel"; +} + +void OMPDTasks::execute() const +{ + ompd_rc_t ret; + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + OMPDTaskHandleCmp task_cmp_op(functions); + std::map, + OMPDTaskHandleCmp> host_task_handles(task_cmp_op); + for (auto t: host_thread_handles) { + for (auto task_handle: odbGetTaskRegions(functions, t)) { + host_task_handles[task_handle].push_back(t); + } + } + + printf("HOST TASKS\n"); + printf("Task Handle Assoc. 
Threads ICV Level Enter Frame Exit Frame Task function\n"); + printf("-----------------------------------------------------------------------------------\n"); + for (auto th: host_task_handles) { + ompd_parallel_handle_t *ph; + ret = functions->ompd_get_task_parallel_handle(th.first, &ph); + if (ret != ompd_rc_ok) { + printf("could not get parallel handle for nesting\n"); + continue; + } + + ompd_word_t icv_level; + icvs->get(ph, "levels-var", &icv_level); + + ompd_address_t enter_frame; + ompd_address_t exit_frame; + ret = functions->ompd_get_task_frame(th.first, &enter_frame, &exit_frame); + if (ret != ompd_rc_ok) { + printf("could not get task frame\n"); + continue; + } + + ompd_address_t task_function; + ret = functions->ompd_get_task_function(th.first, &task_function); + if (ret != ompd_rc_ok) { + printf("could not get task entry point\n"); + } + printf("%-11p %-14zu %-9ld %-11p %-10p %p\n", th.first, + th.second.size(), icv_level, (void*)enter_frame.address, + (void*)exit_frame.address, (void*)task_function.address); + } + + for (auto task: host_task_handles) { + functions->ompd_release_task_handle(task.first); + } + + for (auto thread: host_thread_handles) { + functions->ompd_release_thread_handle(thread); + } + + // Cuda tasks + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDTaskHandleCmp> cuda_task_handles(task_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto task_handle: odbGetTaskRegions(functions, t)) { + cuda_task_handles[task_handle].push_back(t); + } + } + + printf("\nCUDA TASKS\n"); + printf("Task Handle Assoc. 
Threads ICV Level task function\n"); + printf("--------------------------------------------------------\n"); + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); + + for (auto th: cuda_task_handles) { + ompd_parallel_handle_t *ph; + ret = functions->ompd_get_task_parallel_handle(th.first, &ph); + if (ret != ompd_rc_ok) { + printf("could not get parallel handle for nesting\n"); + continue; + } + + ompd_word_t icv_level; + cudaIcvs.get(ph, "levels-var", &icv_level); + + ompd_address_t task_func_addr; + task_func_addr.address = 0; + functions->ompd_get_task_function(th.first, &task_func_addr); + + printf("%-11p %-14zu %-8ld %p\n", th.first, th.second.size(), icv_level, (void*)task_func_addr.address); + functions->ompd_release_parallel_handle(ph); + } + + for (auto task: cuda_task_handles) { + functions->ompd_release_task_handle(task.first); + } + + for (auto thread: cuda_thread_handles) { + functions->ompd_release_thread_handle(thread); + } +} + +const char *OMPDTasks::toString() const +{ + return "odb tasks"; +} diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index 04e8bf912..756658a69 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -28,28 +28,10 @@ #include #include #include +#include #include "ompd.h" #include "ompd_typedefs.h" -#include "ompd_test.h" - - -/* - * The macro is used to create code to register all implemented ompd - * API functions with the CommandFactory - * For new implemented API function just add a new OMPD_DO line - */ - - -#define FOREACH_OMPD_CALLBACK_FN(macro) \ -macro(ompd_dmemory_alloc) \ -macro(ompd_dmemory_free) \ -macro(ompd_tsizeof_prim) \ -macro(ompd_tsymbol_addr) \ -macro(ompd_ttype) \ -macro(ompd_ttype_sizeof) \ -macro(ompd_ttype_offset) \ -macro(ompd_tmemory_access) \ -macro(ompd_print_string) +//#include 
"ompd_test.h" #define FOREACH_OMPD_API_FN(macro) \ macro(ompd_process_initialize) \ @@ -57,43 +39,28 @@ macro(ompd_device_initialize) \ macro(ompd_release_address_space_handle) \ macro(ompd_initialize) \ macro(ompd_finalize) \ -macro(ompd_get_threads) \ macro(ompd_get_thread_in_parallel) \ macro(ompd_release_thread_handle) \ macro(ompd_thread_handle_compare) \ -macro(ompd_get_top_parallel_region) \ +macro(ompd_get_thread_id) \ +macro(ompd_get_current_parallel_handle) \ macro(ompd_get_enclosing_parallel_handle) \ -macro(ompd_get_task_enclosing_parallel_handle) \ +macro(ompd_get_task_parallel_handle) \ macro(ompd_release_parallel_handle) \ macro(ompd_parallel_handle_compare) \ -macro(ompd_get_top_task_region) \ -macro(ompd_get_ancestor_task_region) \ -macro(ompd_get_implicit_task_in_parallel) \ +macro(ompd_get_current_task_handle) \ +macro(ompd_get_generating_task_handle) \ +macro(ompd_get_task_in_parallel) \ macro(ompd_release_task_handle) \ macro(ompd_task_handle_compare) \ -macro(ompd_get_num_procs) \ -macro(ompd_get_thread_limit) \ -macro(ompd_get_num_threads) \ -macro(ompd_get_level) \ -macro(ompd_get_active_level) \ -macro(ompd_get_parallel_id) \ -macro(ompd_get_parallel_function) \ macro(ompd_get_thread_handle) \ -macro(ompd_get_osthread) \ +macro(ompd_enumerate_states) \ macro(ompd_get_state) \ -macro(ompd_get_max_threads) \ -macro(ompd_get_thread_num) \ -macro(ompd_in_parallel) \ -macro(ompd_in_final) \ -macro(ompd_get_dynamic) \ -macro(ompd_get_nested) \ -macro(ompd_get_max_active_levels) \ -macro(ompd_get_schedule) \ -macro(ompd_get_proc_bind) \ +macro(ompd_get_task_function) \ macro(ompd_get_task_frame) \ -macro(ompd_get_task_id) \ -macro(ompd_get_version) \ -macro(ompd_get_version_string) \ +macro(ompd_get_api_version) \ +macro(ompd_enumerate_icvs) \ +macro(ompd_get_icv_from_scope) \ namespace ompd_gdb { @@ -119,23 +86,72 @@ typedef struct FOREACH_OMPD_API_FN(OMPD_API_FUNCTION_POINTER_MEMBER) #undef OMPD_API_FUNCTION_POINTER_MEMBER -/* ompd_rc_t 
(*ompd_initialize) (ompd_callbacks_t *) = nullptr; - ompd_get_thread_handle_fn_t ompd_get_thread_handle = nullptr; - ompd_nesting_level_fn_t ompd_nesting_level = nullptr; - ompd_read_tmemory_fn_t ompd_read_tmemory = nullptr; -*/ - } OMPDFunctions; typedef std::shared_ptr OMPDFunctionsPtr; +class OMPDIcvs +{ +private: + OMPDFunctionsPtr functions; + std::map> availableIcvs; +public: + OMPDIcvs(OMPDFunctionsPtr functions, + ompd_address_space_handle_t *addrhandle); + ompd_rc_t get(ompd_parallel_handle_t *handle, const char *name, + ompd_word_t *value); +}; + +typedef std::shared_ptr OMPDIcvsPtr; + +class OMPDParallelHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDParallelHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_parallel_handle_t *a, ompd_parallel_handle_t *b) { + int cmp = 0; + functions->ompd_parallel_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + +class OMPDThreadHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDThreadHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_thread_handle_t *a, ompd_thread_handle_t *b) { + int cmp = 0; + functions->ompd_thread_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + +class OMPDTaskHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDTaskHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_task_handle_t *a, ompd_task_handle_t *b) { + int cmp = 0; + functions->ompd_task_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + class OMPDCommand; class OMPDCommandFactory { private: void * findFunctionInLibrary(const char *fun) const; + void initOmpd(); OMPDFunctionsPtr functions = nullptr; + OMPDIcvsPtr icvs = nullptr; // ompd_process_handle_t* prochandle = nullptr; ompd_address_space_handle_t* addrhandle = nullptr; OutputString out; @@ -144,7 +160,7 @@ class OMPDCommandFactory OMPDCommandFactory(); ~OMPDCommandFactory(); // OMPDCommand* create(const char *str) const; - OMPDCommand* create(const char 
*str, const std::vector& extraArgs=std::vector()) const; + OMPDCommand* create(const char *str, const std::vector& extraArgs=std::vector()); }; typedef std::unique_ptr OMPDCommandFactoryPtr; @@ -227,12 +243,14 @@ class OMPDThreads : public OMPDCommand class OMPDLevels : public OMPDCommand { + OMPDIcvsPtr icvs; public: ~OMPDLevels(){}; void execute() const; const char* toString() const; protected: - OMPDLevels(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const std::vector& args) : OMPDCommand(f, ah, args){}; + OMPDLevels(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const OMPDIcvsPtr &icvs, const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; friend OMPDCommandFactory; }; @@ -268,9 +286,46 @@ class OMPDTest : public OMPDCommand void execute() const; const char* toString() const; protected: - OMPDTest(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const std::vector& args) : OMPDCommand(f, ah, args){}; + OMPDTest(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, + const OMPDIcvsPtr &icvs, const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; +}; + +class OMPDParallelRegions : public OMPDCommand +{ +public: + ~OMPDParallelRegions() {}; + void execute() const; + const char *toString() const; +protected: + OMPDParallelRegions(const OMPDFunctionsPtr &f, + ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, + const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {} + + friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; +}; + +class OMPDTasks : public OMPDCommand +{ +public: + ~OMPDTasks() {} + void execute() const; + const char *toString() const; +protected: + OMPDTasks(const OMPDFunctionsPtr &f, + ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, + const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {} + friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; }; } diff --git 
a/libompd/gdb-wrapper/OMPDContext.cpp b/libompd/gdb-wrapper/OMPDContext.cpp index 9b92e0d13..b344a1de4 100644 --- a/libompd/gdb-wrapper/OMPDContext.cpp +++ b/libompd/gdb-wrapper/OMPDContext.cpp @@ -139,7 +139,24 @@ ompd_thread_context_t * OMPDHostContext::getContextForThread(gdb_thread_id& thr_ bool OMPDCudaContext::setThisGdbContext() { - bool ret = false; + bool ret = true; + stringstream device_command; + stringstream coord_command; + device_command << "cuda device " << this->cudathread->coord.cudaDevId; + coord_command << "cuda grid " << this->cudathread->coord.gridId + << " block " << this->cudathread->coord.blockIdx.x + << " thread " << this->cudathread->coord.threadIdx.x; + OMPDContextPool::gdb->writeInput(device_command.str().c_str()); + string gdbOut = OMPDContextPool::gdb->readOutput(); + if (gdbOut.find("cannot be satisfied") != 0) + ret = false; + + OMPDContextPool::gdb->writeInput(coord_command.str().c_str()); + gdbOut = OMPDContextPool::gdb->readOutput(); + if (gdbOut.find("cannot be satisfied") != 0) + ret = false; + +#if 0 stringstream command; command #ifdef HACK_FOR_CUDA_GDB @@ -154,6 +171,7 @@ bool OMPDCudaContext::setThisGdbContext() string gdbOut = OMPDContextPool::gdb->readOutput(); if (gdbOut.find("not known")==0) ret = true; +#endif return ret; } diff --git a/libompd/gdb-wrapper/OMPDContext.h b/libompd/gdb-wrapper/OMPDContext.h index be3142439..89793543e 100644 --- a/libompd/gdb-wrapper/OMPDContext.h +++ b/libompd/gdb-wrapper/OMPDContext.h @@ -16,7 +16,7 @@ */ #include "ompd.h" -#include "ompd_test.h" +//#include "ompd_test.h" #include "GdbProcess.h" #include "Callbacks.h" #include "CudaGdb.h" diff --git a/libompd/gdb-wrapper/StringParser.cpp b/libompd/gdb-wrapper/StringParser.cpp index 0df120459..e3ebf3f93 100644 --- a/libompd/gdb-wrapper/StringParser.cpp +++ b/libompd/gdb-wrapper/StringParser.cpp @@ -224,7 +224,7 @@ vector StringParser::matchCudaThreadsInfo( coord.gridId = grid; coord.cudaContext = ctx; coord.cudaDevId = dev; - 
coord.kernelId = kernel; + coord.warpSize = 0; for (int b = 0; b < threadcounts.size(); ++b) { coord.blockIdx.x = b; diff --git a/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake b/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake new file mode 100644 index 000000000..14613ae4f --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake @@ -0,0 +1,60 @@ +# - Try to find GDB +# +# Once done, this will define: +# CUDA_GDB_FOUND - system has CUDA_GDB +# CUDA_GDB_COMMAND - the command to run +# CUDA_GDB_VERSION - version +# CUDA_GDB_HAS_RETURN_CHILD_RESULT - if the --return-child-result flag is supported +# +# Useful configuration variables you might want to add to your cache: +# CUDA_GDB_ROOT_DIR - A directory prefix to search +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +set(CUDA_GDB_ROOT_DIR + "${CUDA_GDB_ROOT_DIR}" + CACHE + PATH + "Directory to start our search in") + +find_program(CUDA_GDB_COMMAND + NAMES + cuda-gdb + HINTS + "${CUDA_GDB_ROOT_DIR}" + PATH_SUFFIXES + bin + libexec) + +if(CUDA_GDB_COMMAND) + execute_process(COMMAND cuda-gdb --version + COMMAND head -n 1 + OUTPUT_VARIABLE CUDA_GDB_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "[^0-9]*([0-9]+[0-9.]*).*" "\\1" CUDA_GDB_VERSION "${CUDA_GDB_VERSION}") +endif() + +# handle the QUIETLY and REQUIRED arguments and set xxx_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUDA_GDB DEFAULT_MSG CUDA_GDB_COMMAND CUDA_GDB_VERSION) + +if(CUDA_GDB_FOUND) + mark_as_advanced(CUDA_GDB_ROOT_DIR) + if(CUDA_GDB_VERSION VERSION_LESS 6.4) + set(CUDA_GDB_HAS_RETURN_CHILD_RESULT FALSE) + else() + 
set(CUDA_GDB_HAS_RETURN_CHILD_RESULT TRUE) + endif() +endif() + +mark_as_advanced(CUDA_GDB_COMMAND) diff --git a/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake b/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake new file mode 100644 index 000000000..a5f743da6 --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake @@ -0,0 +1,60 @@ +# - Try to find GDB +# +# Once done, this will define: +# GDB_FOUND - system has GDB +# GDB_COMMAND - the command to run +# GDB_VERSION - version +# GDB_HAS_RETURN_CHILD_RESULT - if the --return-child-result flag is supported +# +# Useful configuration variables you might want to add to your cache: +# GDB_ROOT_DIR - A directory prefix to search +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +set(GDB_ROOT_DIR + "${GDB_ROOT_DIR}" + CACHE + PATH + "Directory to start our search in") + +find_program(GDB_COMMAND + NAMES + gdb + HINTS + "${GDB_ROOT_DIR}" + PATH_SUFFIXES + bin + libexec) + +if(GDB_COMMAND) + execute_process(COMMAND gdb --version + COMMAND head -n 1 + OUTPUT_VARIABLE GDB_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "[^0-9]*([0-9]+[0-9.]*).*" "\\1" GDB_VERSION "${GDB_VERSION}") +endif() + +# handle the QUIETLY and REQUIRED arguments and set xxx_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GDB DEFAULT_MSG GDB_COMMAND GDB_VERSION) + +if(GDB_FOUND) + mark_as_advanced(GDB_ROOT_DIR) + if(GDB_VERSION VERSION_LESS 6.4) + set(GDB_HAS_RETURN_CHILD_RESULT FALSE) + else() + set(GDB_HAS_RETURN_CHILD_RESULT TRUE) + endif() +endif() + +mark_as_advanced(GDB_COMMAND) diff --git a/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake 
b/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake new file mode 100644 index 000000000..745cfe583 --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake @@ -0,0 +1,47 @@ +# - Try to find readline include dirs and libraries +# +# Usage of this module as follows: +# +# find_package(Readline) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Readline_ROOT_DIR Set this variable to the root installation of +# readline if the module has problems finding the +# proper installation path. +# +# Variables defined by this module: +# +# READLINE_FOUND System has readline, include and lib dirs found +# Readline_INCLUDE_DIR The readline include directories. +# Readline_LIBRARY The readline library. + +find_path(Readline_ROOT_DIR + NAMES include/readline/readline.h +) + +find_path(Readline_INCLUDE_DIR + NAMES readline/readline.h + HINTS ${Readline_ROOT_DIR}/include +) + +find_library(Readline_LIBRARY + NAMES readline + HINTS ${Readline_ROOT_DIR}/lib +) + +if(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + set(READLINE_FOUND TRUE) +else(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + FIND_LIBRARY(Readline_LIBRARY NAMES readline) + include(FindPackageHandleStandardArgs) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG Readline_INCLUDE_DIR Readline_LIBRARY ) + MARK_AS_ADVANCED(Readline_INCLUDE_DIR Readline_LIBRARY) +endif(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + +mark_as_advanced( + Readline_ROOT_DIR + Readline_INCLUDE_DIR + Readline_LIBRARY +) diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h new file mode 100644 index 000000000..825916434 --- /dev/null +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -0,0 +1,200 @@ +#include "ompd.h" + + +/* 4.3.4.1 + * Global initialization and finalization + */ + + +typedef ompd_rc_t (*ompd_initialize_fn_t) ( + ompd_word_t api_version, + const 
ompd_callbacks_t *callbacks +); + +typedef ompd_rc_t (*ompd_get_api_version_fn_t) ( + ompd_word_t *version +); + +typedef ompd_rc_t (*ompd_get_version_string_fn_t) ( + const char **string +); + +typedef ompd_rc_t (*ompd_finalize_fn_t) (void); + +/* 4.3.4.2 + * Per OpenMP Process Initialiyation and Finalization + */ + +typedef ompd_rc_t (*ompd_process_initialize_fn_t) ( + ompd_address_space_context_t *context, + ompd_address_space_handle_t **handle + ); + +typedef ompd_rc_t (*ompd_device_initialize_fn_t) ( + ompd_address_space_handle_t *process_handle, /*IN: address space of the OpenMP process*/ + ompd_address_space_context_t *device_context, /*IN: Opaque tool handle for device address space*/ + ompd_device_t kind, /*IN: device identifier kind*/ + ompd_size_t sizeof_id, /*IN: size of device identifier*/ + void *id, /*IN: device identifier*/ + ompd_address_space_handle_t **device_handle /*OUT: device handle*/ + ); + + +typedef ompd_rc_t (*ompd_release_address_space_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ + ); + +/* 4.3.4.4 + * Address space information + */ + +typedef ompd_rc_t (*ompd_get_omp_version_fn_t) ( + ompd_address_space_handle_t *address_space, + ompd_word_t *omp_version + ); + +typedef ompd_rc_t (*ompd_get_omp_version_string_fn_t) ( + ompd_address_space_handle_t *address_space, + const char **string + ); + +/* 4.3.4.5 + * Thread Handles + */ + +typedef ompd_rc_t (*ompd_get_thread_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /*IN: handle for the parallel region*/ + int thread_num, /*IN: the nubmer of the thread that is returned*/ + ompd_thread_handle_t **thread_hanlde /*OUT: returned thread handle*/ + ); + + +typedef ompd_rc_t (*ompd_get_thread_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_osthread, + const void* osthread, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread 
handle*/ + ); + +typedef ompd_rc_t (*ompd_release_thread_handle_fn_t) ( + ompd_thread_handle_t *thread_handle +); + +typedef ompd_rc_t (*ompd_thread_handle_compare_fn_t) ( + ompd_thread_handle_t *thread_handle_1, + ompd_thread_handle_t *thread_handle_2, + int *cmp_value +); + +typedef ompd_rc_t (*ompd_get_thread_id_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, + void *thread_id + ); + +/* 4.3.4.6 + * Parallel Region Handles + */ + +typedef ompd_rc_t (*ompd_get_current_parallel_handle_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_parallel_handle_t **parallel_handle + ); + +typedef ompd_rc_t (*ompd_get_enclosing_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ); + +typedef ompd_rc_t (*ompd_get_task_parallel_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_parallel_handle_t **task_parallel_handle + ); + +typedef ompd_rc_t (*ompd_release_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle + ); + +typedef ompd_rc_t (*ompd_parallel_handle_compare_fn_t) ( + ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value + ); + +/* 4.3.4.7 + * Task Handles + */ + +typedef ompd_rc_t (*ompd_get_current_task_handle_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_task_handle_t **task_handle + ); + +typedef ompd_rc_t (*ompd_get_generating_task_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_task_handle_t **generating_task_handle + ); + +typedef ompd_rc_t (*ompd_get_scheduling_task_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_task_handle_t **scheduling_task_handle + ); + +typedef ompd_rc_t (*ompd_get_task_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_task_handle_t **task_handle + ); + +typedef ompd_rc_t (*ompd_release_task_handle_fn_t) ( + 
ompd_task_handle_t *task_handle +); + +typedef ompd_rc_t (*ompd_task_handle_compare_fn_t) ( + ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value +); + +typedef ompd_rc_t (*ompd_get_task_function_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_address_t *entry_point + ); + +typedef ompd_rc_t (*ompd_get_task_frame_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_address_t *exit_frame, + ompd_address_t *enter_frame + ); + +typedef ompd_rc_t (*ompd_enumerate_states_fn_t) ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + +typedef ompd_rc_t (*ompd_get_state_fn_t) ( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + +typedef ompd_rc_t (*ompd_enumerate_icvs_fn_t) ( + ompd_address_space_handle_t *handle, + ompd_icv_id_t current, + ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more + ); + +typedef ompd_rc_t (*ompd_get_icv_from_scope_fn_t) ( + void *handle, + ompd_scope_t scope, + ompd_icv_id_t icv_id, + ompd_word_t *icv_value + ); diff --git a/libompd/src/CMakeLists.txt b/libompd/src/CMakeLists.txt index 0fb4e6b0f..5ffc44035 100644 --- a/libompd/src/CMakeLists.txt +++ b/libompd/src/CMakeLists.txt @@ -1,6 +1,6 @@ project (libompd) -add_library (ompd SHARED TargetValue.cpp omp-debug.cpp) +add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp omp-icv.cpp) add_dependencies(ompd omp) # ensure generated import library is created first diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index 43a394f83..d1de522fd 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -6,9 +6,14 @@ #include const ompd_callbacks_t *TValue::callbacks = NULL; -ompd_target_type_sizes_t TValue::type_sizes; 
+ompd_device_type_sizes_t TValue::type_sizes; +// MARKER_MR: This is just compat stuff because I dont have time to +// replace this function. TODO: replace this function inline int ompd_sizeof(ompd_target_prim_types_t t) { + assert(t != ompd_type_max && "ompd_type_max should not be used anywhere"); + assert(t != ompd_type_invalid && "request size of invalid type"); + return (((char *)&TValue::type_sizes)[(int)t]); } @@ -44,7 +49,14 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { ompd_size_t tmpSize; std::stringstream ss; ss << "ompd_sizeof__" << typeName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -52,15 +64,25 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { << ") \\" << std::endl; return ret; } + symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are alwazs in + // shared memory. 
+ // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpSize)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host( - context, &tmpSize, ompd_sizeof(ompd_type_long_long), 1, &(typeSize)); + ret = TValue::callbacks->device_to_host( + context, &tmpSize, TValue::type_sizes.sizeof_long_long, 1, &(typeSize)); } *size = typeSize; return ret; @@ -77,7 +99,7 @@ ompd_rc_t TType::getBitfieldMask(const char *fieldName, // &fieldOffset); std::stringstream ss; ss << "ompd_bitfield__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -87,14 +109,14 @@ ompd_rc_t TType::getBitfieldMask(const char *fieldName, } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpMask)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host(context, &(tmpMask), - ompd_sizeof(ompd_type_long_long), 1, - &(bitfieldMask)); + ret = TValue::callbacks->device_to_host(context, &(tmpMask), + TValue::type_sizes.sizeof_long_long, + 1, &(bitfieldMask)); if (ret != ompd_rc_ok) { return ret; } @@ -114,7 +136,14 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { // &fieldOffset); std::stringstream ss; ss << "ompd_access__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + + // 
HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -124,14 +153,23 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are alwazs in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host(context, &(tmpOffset), - ompd_sizeof(ompd_type_long_long), 1, - &fieldOffset); + ret = TValue::callbacks->device_to_host(context, &(tmpOffset), + TValue::type_sizes.sizeof_long_long, + 1, &fieldOffset); if (ret != ompd_rc_ok) { return ret; } @@ -151,7 +189,14 @@ ompd_rc_t TType::getElementSize(const char *fieldName, ompd_size_t *size) { // &fieldOffset); std::stringstream ss; ss << "ompd_sizeof__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -161,13 +206,23 @@ ompd_rc_t TType::getElementSize(const char 
*fieldName, ompd_size_t *size) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are alwazs in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host( - context, &tmpOffset, ompd_sizeof(ompd_type_long_long), 1, &fieldSize); + ret = TValue::callbacks->device_to_host(context, &tmpOffset, + TValue::type_sizes.sizeof_long_long, + 1, &fieldSize); if (ret != ompd_rc_ok) { return ret; } @@ -209,7 +264,7 @@ TValue::TValue(ompd_address_space_context_t *_context, /*valueName(_valueName),*/ context(_context), tcontext(_tcontext), fieldSize(0) { errorState.errorCode = - callbacks->tsymbol_addr(context, tcontext, _valueName, &symbolAddr); + callbacks->symbol_addr_lookup(context, tcontext, _valueName, &symbolAddr); symbolAddr.segment = segment; // assert((ret==ompd_rc_ok) && "Callback call failed"); } @@ -257,14 +312,14 @@ TValue TValue::dereference() const { assert(pointerLevel > 0 && "cannot dereference non-pointer"); TValue ret = *this; ret.pointerLevel--; - ret.errorState.errorCode = callbacks->read_tmemory( - context, tcontext, symbolAddr, 1 * ompd_sizeof(ompd_type_pointer), + ret.errorState.errorCode = callbacks->read_memory( + context, tcontext, symbolAddr, 1 * TValue::type_sizes.sizeof_pointer, &(tmpAddr.address)); if (ret.errorState.errorCode != ompd_rc_ok) return ret; - ret.errorState.errorCode = callbacks->target_to_host( - context, &(tmpAddr.address), ompd_sizeof(ompd_type_pointer), 1, + ret.errorState.errorCode = 
callbacks->device_to_host( + context, &(tmpAddr.address), TValue::type_sizes.sizeof_pointer, 1, &(ret.symbolAddr.address)); if (ret.errorState.errorCode != ompd_rc_ok) { return ret; @@ -290,7 +345,7 @@ ompd_rc_t TValue::getRawValue(void *buf, int count) { return errorState.errorCode; errorState.errorCode = - callbacks->read_tmemory(context, tcontext, symbolAddr, size, buf); + callbacks->read_memory(context, tcontext, symbolAddr, size, buf); return errorState.errorCode; } @@ -307,7 +362,11 @@ TBaseValue TValue::castBase(const char *varName) { return TBaseValue(*this, size); } -TBaseValue TValue::castBase() const { return TBaseValue(*this, fieldSize); } +TBaseValue TValue::castBase() const { + if(pointerLevel>0) + return TBaseValue(*this, type_sizes.sizeof_pointer); + return TBaseValue(*this, fieldSize); +} TBaseValue TValue::castBase(ompd_target_prim_types_t baseType) const { return TBaseValue(*this, baseType); @@ -345,7 +404,12 @@ ompd_rc_t TValue::check(const char *bitfieldName, ompd_word_t *isSet) const { TValue TValue::getArrayElement(int elemNumber) const { if (gotError()) return *this; - TValue ret = dereference(); + TValue ret; + if (pointerLevel > 0) { + ret = dereference(); + } else { + ret = *this; + } if (ret.pointerLevel == 0) { ompd_size_t size; ret.errorState.errorCode = type->getSize(&size); @@ -356,6 +420,16 @@ TValue TValue::getArrayElement(int elemNumber) const { return ret; } +TValue TValue::getPtrArrayElement(int elemNumber) const { + if (gotError()) { + return *this; + } + assert(pointerLevel > 0 && "This only works on arrays of pointers"); + TValue ret = *this; + ret.symbolAddr.address += elemNumber * type_sizes.sizeof_pointer; + return ret; +} + TBaseValue::TBaseValue(const TValue &_tvalue, ompd_target_prim_types_t _baseType) : TValue(_tvalue), baseTypeSize(ompd_sizeof(_baseType)) {} @@ -365,12 +439,12 @@ TBaseValue::TBaseValue(const TValue &_tvalue, ompd_size_t _baseTypeSize) ompd_rc_t TBaseValue::getValue(void *buf, int count) { if 
(errorState.errorCode != ompd_rc_ok) return errorState.errorCode; - errorState.errorCode = callbacks->read_tmemory(context, tcontext, symbolAddr, + errorState.errorCode = callbacks->read_memory(context, tcontext, symbolAddr, count * baseTypeSize, buf); if (errorState.errorCode != ompd_rc_ok) return errorState.errorCode; errorState.errorCode = - callbacks->target_to_host(context, buf, baseTypeSize, count, buf); + callbacks->device_to_host(context, buf, baseTypeSize, count, buf); return errorState.errorCode; } @@ -378,7 +452,7 @@ ompd_rc_t TBaseValue::getValue(void *buf, int count) { // { // if( errorState.errorCode != ompd_rc_ok ) // return errorState.errorCode; -// errorState.errorCode = callbacks->read_tmemory(context, tcontext, +// errorState.errorCode = callbacks->read_memory(context, tcontext, // symbolAddr, // count, baseType, &(buf->th)); // assert((errorState.errorCode == ompd_rc_ok) && "Callback call failed"); diff --git a/libompd/src/TargetValue.h b/libompd/src/TargetValue.h index cbf8a4f9f..cf14ea716 100644 --- a/libompd/src/TargetValue.h +++ b/libompd/src/TargetValue.h @@ -1,5 +1,6 @@ #include "ompd.h" +#include "ompd-private.h" #include #ifndef SRC_TARGET_VALUE_H_ @@ -100,7 +101,7 @@ class TValue { public: static const ompd_callbacks_t *callbacks; - static ompd_target_type_sizes_t type_sizes; + static ompd_device_type_sizes_t type_sizes; TValue() : errorState(ompd_rc_error) {} /** @@ -185,6 +186,10 @@ class TValue { * Get an array element */ TValue getArrayElement(int elemNumber) const; + /** + * Get an element of a pointer arraz + */ + TValue getPtrArrayElement(int elemNumber) const; /** * Did we raise some error yet? 
*/ diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 325872273..fae2b63cc 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -15,27 +15,27 @@ #include "omp-debug.h" #include "omp.h" -#include "ompd.h" -// #include +#include "ompd-private.h" #include "TargetValue.h" #include #include #include #include #include -#include -ompd_target_type_sizes_t type_sizes; +ompd_device_type_sizes_t type_sizes; uint64_t ompd_state; /* --- OMPD functions ------------------------------------------------------- */ -/* --- 3 Initialization ----------------------------------------------------- */ +/* --- 1 Initialization ----------------------------------------------------- */ -ompd_rc_t ompd_initialize(const ompd_callbacks_t *table, ompd_word_t version) { +ompd_rc_t ompd_initialize(ompd_word_t version, const ompd_callbacks_t *table) { ompd_rc_t ret = table ? ompd_rc_ok : ompd_rc_bad_input; callbacks = table; TValue::callbacks = table; + __ompd_init_icvs(table); + __ompd_init_states(table); return ret; } @@ -57,36 +57,28 @@ ompd_process_initialize(ompd_address_space_context_t ompd_rc_t ret = initTypeSizes(context); if (ret != ompd_rc_ok) return ret; - ret = TValue(context, "ompd_rtl_version") - .castBase(ompd_type_int) - .getValue(rtl_version); - if ((ret == ompd_rc_ok && rtl_version < 5) || - ret == ompd_rc_target_read_error) - return ompd_rc_incompatible; - if (ret != ompd_rc_ok) - return ret; + ret = TValue(context, "ompd_state") .castBase(ompd_type_long_long) .getValue(ompd_state); if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_address_space_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_address_space_handle_t), (void **)(addrhandle)); if (ret != ompd_rc_ok) return ret; -// *addrhandle = new ompd_address_space_handle_t; if (!addrhandle) return ompd_rc_error; (*addrhandle)->context = context; - (*addrhandle)->kind = ompd_device_kind_host; + (*addrhandle)->kind = OMPD_DEVICE_KIND_HOST; return 
ompd_rc_ok; } ompd_rc_t -ompd_get_openmp_version(ompd_address_space_handle_t +ompd_get_omp_version(ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_word_t *version) { + ompd_word_t *version) { if (!addr_handle) return ompd_rc_stale_handle; ompd_address_space_context_t *context = addr_handle->context; @@ -103,7 +95,7 @@ ompd_get_openmp_version(ompd_address_space_handle_t return ret; } -ompd_rc_t ompd_get_openmp_version_string( +ompd_rc_t ompd_get_omp_version_string( ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ const char **string) { @@ -121,48 +113,38 @@ ompd_rc_t ompd_release_address_space_handle( if (!addr_handle) return ompd_rc_bad_input; - ompd_rc_t ret = callbacks->dmemory_free((void *)(addr_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(addr_handle)); // delete addr_handle; return ret; } -#if 0 // no device support yet -ompd_rc_t ompd_device_initialize ( - ompd_address_space_context_t *context, /* IN: */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the device */ +ompd_rc_t ompd_device_initialize( + ompd_address_space_handle_t *process_handle, + ompd_address_space_context_t *device_context, + int kind, + ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle ) { - if (!context) + if (!device_context) return ompd_rc_bad_input; - ompd_rc_t ret = initTypeSizes(context); - if (ret != ompd_rc_ok) - return ret; - + ompd_rc_t ret; uint64_t ompd_num_cuda_devices; - ompd_address_space_context_t *process_context; - ret = callbacks->get_containing_process_context(context, &process_context); - if ( ret != ompd_rc_ok ) - return ret; - - ret = TValue(process_context, "ompd_num_cuda_devices"). + ret = TValue(process_handle->context, "ompd_num_cuda_devices"). castBase(ompd_type_long_long). 
getValue(ompd_num_cuda_devices); - if (ret != ompd_rc_ok) { + if (ret != ompd_rc_ok) return ret; - } + for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { uint64_t cuda_ctx; - /* TODO(mjm) - Hack! Currently using ompt_parallel_id_t. Need to find a - * place to define ID type information for CUDA contexts - */ - ret = TValue(process_context, "ompd_CudaContextArray"). - cast("ompt_parallel_id_t",1). + ret = TValue(process_handle->context, "ompd_CudaContextArray"). + cast("ompd_cuda_context_ptr_t",1). getArrayElement(i). castBase(ompd_type_long_long). getValue(cuda_ctx); @@ -170,28 +152,24 @@ ompd_rc_t ompd_device_initialize ( if ( ret != ompd_rc_ok ) continue; - if (cuda_ctx == id) { - ret = callbacks->dmemory_alloc(sizeof(ompd_address_space_handle_t), - (void **)(addrhandle)); + if (cuda_ctx == *((uint64_t *)id)) { + ret = callbacks->memory_alloc(sizeof(ompd_address_space_handle_t), + (void **)(device_handle)); if (ret != ompd_rc_ok) return ret; -// *addrhandle = new ompd_address_space_handle_t; - if (!addrhandle) + if (!device_handle) return ompd_rc_error; - (*addrhandle)->context = context; - + (*device_handle)->context = device_context; + (*device_handle)->kind = OMPD_DEVICE_KIND_CUDA; + (*device_handle)->id = (uint64_t)id; return ompd_rc_ok; } } - /* TODO(mjm) - Find appropriate error return result for not finding a match */ - return ompd_rc_ok; + return ompd_rc_unavailable; } -#endif // no device support - -/* --- 4 Handle Management -------------------------------------------------- */ -/* --- 4.1 Thread Handles --------------------------------------------------- */ +/* --- 4.5 Thread Handles --------------------------------------------------- */ /* thread_handle is of type (kmp_base_info_t) */ @@ -206,32 +184,75 @@ ompd_rc_t ompd_get_thread_in_parallel( return ompd_rc_stale_handle; ompd_address_space_context_t *context = parallel_handle->ah->context; ompd_rc_t ret; - int i; if (!context) return ompd_rc_stale_handle; assert(callbacks && "Callback table not 
initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + uint16_t thread_idx; + // We cannot use the task descriptor associated with the parallel info as + // their task might not be currently active + // So to get the current thread, we access the tasks thread info and get + // get its threadIdx.x + auto TaskDescr = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(nth_handle); + + ret = TaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("threadIdx_x") + .castBase(ompd_type_short) + .getValue(thread_idx); + + if (ret != ompd_rc_ok) { + return ret; + } - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_threads") /*t.t_threads*/ - .cast("kmp_info_t", 2) - .getArrayElement(nth_handle) /*t.t_threads[nth_handle]*/ - .access("th") /*t.t_threads[i]->th*/ + ret = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 2, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getPtrArrayElement(thread_idx) + .dereference() .getAddress(&taddr); + if (taddr.address == 0 && thread_idx % 32 == 0) { + ret = TaskDescr.getAddress(&taddr); + } + } else { + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_threads") /*t.t_threads*/ + .cast("kmp_info_t", 2) + .getArrayElement(nth_handle) /*t.t_threads[nth_handle]*/ + .access("th") /*t.t_threads[i]->th*/ + .getAddress(&taddr); + } + if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + + ret = 
callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; (*thread_handle)->th = taddr; (*thread_handle)->ah = parallel_handle->ah; + (*thread_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } @@ -240,7 +261,7 @@ ompd_rc_t ompd_release_thread_handle( ) { if (!thread_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(thread_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(thread_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -253,30 +274,30 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, return ompd_rc_stale_handle; if (!thread_handle_2) return ompd_rc_stale_handle; + if (thread_handle_1->ah->kind != thread_handle_2->ah->kind) + return ompd_rc_bad_input; *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; - return ompd_rc_ok; -} + if (*cmp_value == 0 && thread_handle_1->ah->kind == OMPD_DEVICE_KIND_CUDA) { + *cmp_value = thread_handle_1->cuda_kernel_info->cudaDevId - + thread_handle_2->cuda_kernel_info->cudaDevId; + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->cudaContext - + thread_handle_2->cuda_kernel_info->cudaContext; + } + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->warpSize - + thread_handle_2->cuda_kernel_info->warpSize; + } + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->gridId - + thread_handle_2->cuda_kernel_info->gridId; + } + } -#if 0 -ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id - ) -{ - pthread_t thread_id; - ompd_rc_t ret; - ret = ompd_get_thread_id(thread_handle, ompd_thread_id_pthread, sizeof(pthread_t), &thread_id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long 
long)thread_id); return ompd_rc_ok; } -#endif -/* --- 4.2 Parallel Region Handles------------------------------------------- */ +/* --- 4.6 Parallel Region Handles------------------------------------------- */ /* parallel_handle is of type (kmp_base_team_t)*/ @@ -289,38 +310,100 @@ ompd_rc_t ompd_get_current_parallel_handle( if (!thread_handle->ah) return ompd_rc_stale_handle; ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) + ompd_thread_context_t *thread_context = thread_handle->thread_context; + if (!context || !thread_context) return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; - TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_team") /*__kmp_threads[t]->th.th_team*/ - .cast("kmp_team_p", 1) - .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + ompd_rc_t ret; - ompd_rc_t ret = teamdata.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + ompd_address_t taddr; + TValue ph; + // The ompd_parallel_info_t we need is only present in the previous task + // of an implicit task. 
+ uint16_t task_is_implicit = 0; + ret = ompd_rc_ok; + auto possibleTaskDescr = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + while (!task_is_implicit && ret == ompd_rc_ok) { + ret = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_implicit") + .castBase() + .getValue(task_is_implicit); + possibleTaskDescr = possibleTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", + 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference().getAddress(&taddr); + } - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) { + if (taddr.address == 0) { + ph = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext") + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } else { + return ret; + } + } else { + ph = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } + + ret = ph.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) + return ret; + + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; + } else { + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; - 
(*parallel_handle)->ah = thread_handle->ah; - (*parallel_handle)->th = taddr; - (*parallel_handle)->lwt = lwt; + TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_team") /*__kmp_threads[t]->th.th_team*/ + .cast("kmp_team_p", 1) + .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->lwt = lwt; + } return ompd_rc_ok; } @@ -339,47 +422,124 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = parallel_handle->th, lwt; - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, parallel_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + ompd_address_t taddr = parallel_handle->th, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; + ompd_rc_t ret; - TValue teamdata = - TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_parent") /*t.t_parent*/ - .cast("kmp_team_p", 1) - .access("t"); /*t.t_parent->t*/ + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + uint16_t level; + TValue curParallelInfo = TValue(context, taddr) + .cast("ompd_nvptx_parallel_info_t", 0, + 
OMPD_SEGMENT_CUDA_PTX_SHARED); + + ret = curParallelInfo + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("level") + .castBase(ompd_type_short) + .getValue(level); - ret = teamdata.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) + TValue prevTaskDescr = curParallelInfo.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference(); + + ret = prevTaskDescr.getAddress(&taddr); + + // If the previous task of the tasks of the current parallel region is + // NULL, then we got the parallel handle for the (implicit?) top level + // task which has no enclosing task. + if (ret != ompd_rc_ok) { + return ret; + } + + // The instance of TaskDescr for the previous task contains the parallel + // info for the current parallel region. 
So we have to go back to the + // previous task of the previous task + prevTaskDescr = prevTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference(); + + ret = prevTaskDescr.getAddress(&taddr); + + if (ret != ompd_rc_ok) { + if (taddr.address == 0 && level == 1) { + // If we are in generic mode, there is an implicit parallel region + // around the master thread + prevTaskDescr = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo"); + } else { + return ret; + } + } else { + prevTaskDescr = prevTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel"); + } + + ret = prevTaskDescr.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getAddress(&taddr); + + if (ret != ompd_rc_ok) { return ret; + } + } else { + ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, parallel_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + + TValue teamdata = + TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_parent") /*t.t_parent*/ + .cast("kmp_team_p", 1) + .access("t"); /*t.t_parent->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + } } - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = 
callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(enclosing_parallel_handle)); if (ret != ompd_rc_ok) return ret; (*enclosing_parallel_handle)->th = taddr; (*enclosing_parallel_handle)->lwt = lwt; (*enclosing_parallel_handle)->ah = parallel_handle->ah; + (*enclosing_parallel_handle)->cuda_kernel_info = + parallel_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -398,20 +558,66 @@ ompd_rc_t ompd_get_task_parallel_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret; - ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .getAddress(&taddr); + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + TValue parallelHandle; + // The ompd_parallel_info_t we need is only present in the previous task + // of an implicit task. + uint16_t task_is_implicit = 0; + ret = ompd_rc_ok; + auto possibleTaskDescr = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + while (!task_is_implicit && ret == ompd_rc_ok) { + ret = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_implicit") + .castBase() + .getValue(task_is_implicit); + possibleTaskDescr = possibleTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", + 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference().getAddress(&taddr); + } + + if (ret != ompd_rc_ok) { + if (taddr.address == 0) { + parallelHandle = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext") + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } else { + return ret; + } + } else { + 
parallelHandle = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } + ret = parallelHandle.getAddress(&taddr); + } else { + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(enclosing_parallel_handle)); if (ret != ompd_rc_ok) return ret; @@ -419,6 +625,7 @@ ompd_rc_t ompd_get_task_parallel_handle( (*enclosing_parallel_handle)->ah = task_handle->ah; (*enclosing_parallel_handle)->lwt = task_handle->lwt; (*enclosing_parallel_handle)->th = taddr; + (*enclosing_parallel_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -427,7 +634,7 @@ ompd_rc_t ompd_release_parallel_handle( ) { if (!parallel_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(parallel_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -441,38 +648,25 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, return ompd_rc_stale_handle; if (!parallel_handle_2) return ompd_rc_stale_handle; - if (parallel_handle_1->th.address - parallel_handle_2->th.address) + if (parallel_handle_1->ah->kind != parallel_handle_2->ah->kind) + return ompd_rc_bad_input; + if (parallel_handle_1->ah->kind == OMPD_DEVICE_KIND_HOST) { + if (parallel_handle_1->th.address - parallel_handle_2->th.address) + *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; + else + *cmp_value = + parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + } else { 
*cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; - else - *cmp_value = - parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + } return ompd_rc_ok; } -#if 0 // parallel-id is initialized to zero -ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id - ) -{ - ompd_parallel_id_t id; - ompd_rc_t ret; - ret = ompd_get_parallel_id(parallel_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; -} -#endif - -/* --- 4.3 Task Handles ----------------------------------------------------- */ +/* --- 4.7 Task Handles ----------------------------------------------------- */ /* task_handle is of type (kmp_taskdata_t) */ -ompd_rc_t ompd_get_current_task__handle( +ompd_rc_t ompd_get_current_task_handle( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ ) { @@ -485,31 +679,38 @@ ompd_rc_t ompd_get_current_task__handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; + ompd_rc_t ret = ompd_rc_ok; - TValue taskdata = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ - .cast("kmp_taskdata_t", 1); + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + lwt.address = 0; + taddr = thread_handle->th; + } else { + TValue taskdata = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + 
.access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ + .cast("kmp_taskdata_t", 1); - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = taskdata - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); + ret = taskdata.dereference().getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + ret = taskdata + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(task_handle)); if (ret != ompd_rc_ok) return ret; @@ -517,13 +718,19 @@ ompd_rc_t ompd_get_current_task__handle( (*task_handle)->th = taddr; (*task_handle)->lwt = lwt; (*task_handle)->ah = thread_handle->ah; + (*task_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; return ompd_rc_ok; } -ompd_rc_t ompd_get_generating_ancestor_task_handle( +ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { + // Generating and Scheduling task are the same on cuda? 
+ if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + return ompd_get_scheduling_task_handle(task_handle, parent_task_handle); + } + if (!task_handle) return ompd_rc_stale_handle; if (!task_handle->ah) @@ -533,7 +740,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = task_handle->th, lwt; + ompd_address_t taddr = task_handle->th, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret = ompd_rc_stale_handle; TValue lwtValue = TValue(context, task_handle->lwt); @@ -569,7 +776,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ret; } - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(parent_task_handle)); if (ret != ompd_rc_ok) return ret; @@ -580,7 +787,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ret; } -ompd_rc_t ompd_get_scheduling_ancestor_task_handle( +ompd_rc_t ompd_get_scheduling_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { @@ -593,27 +800,47 @@ ompd_rc_t ompd_get_scheduling_ancestor_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + ompd_rc_t ret; - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("scheduling_parent") // td->ompd_task_info.scheduling_parent - .cast("kmp_taskdata_t", 1) - .dereference() - .getAddress(&taddr); + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + ret = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + 
.dereference() + .getAddress(&taddr); + if (taddr.address == 0) { + return ompd_rc_unavailable; + } + } else { + ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("ompt_task_info") // td->ompt_task_info + .cast("ompt_task_info_t") + .access("scheduling_parent") // td->ompd_task_info.scheduling_parent + .cast("kmp_taskdata_t", 1) + .castBase() + .getValue(taddr.address); + if (taddr.address == 0) { + return ompd_rc_unavailable; + } + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(parent_task_handle)); if (ret != ompd_rc_ok) return ret; (*parent_task_handle)->th = taddr; + (*parent_task_handle)->lwt = {OMPD_SEGMENT_UNSPECIFIED,0}; (*parent_task_handle)->ah = task_handle->ah; + (*parent_task_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ret; } @@ -634,24 +861,38 @@ ompd_rc_t ompd_get_task_in_parallel( assert(callbacks && "Callback table not initialized!"); ompd_rc_t ret; - ompd_address_t taddr; - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ - .cast("kmp_taskdata_t", 1) - .getArrayElement( - nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ - .getAddress(&taddr); + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_paralel_info", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(nth_handle) + .getAddress(&taddr); + } else { + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ + .cast("kmp_taskdata_t", 1) + .getArrayElement( + nth_handle) 
/*t.t_implicit_task_taskdata[nth_handle]*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(task_handle)); if (ret != ompd_rc_ok) return ret; (*task_handle)->th = taddr; (*task_handle)->ah = parallel_handle->ah; + (*task_handle)->lwt = {OMPD_SEGMENT_UNSPECIFIED,0}; + (*task_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } @@ -660,7 +901,7 @@ ompd_rc_t ompd_release_task_handle( ) { if (!task_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(task_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(task_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -673,38 +914,26 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, return ompd_rc_stale_handle; if (!task_handle_2) return ompd_rc_stale_handle; - if (task_handle_1->th.address - task_handle_2->th.address) + if (task_handle_1->ah->kind != task_handle_2->ah->kind) + return ompd_rc_bad_input; + if (task_handle_1->th.address - task_handle_2->th.address || + task_handle_1->ah->kind == OMPD_DEVICE_KIND_CUDA) *cmp_value = task_handle_1->th.address - task_handle_2->th.address; else *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; return ompd_rc_ok; } -#if 0 // all task ids are initialized to zero -ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id - ) -{ - ompd_task_id_t id; - ompd_rc_t ret = ompd_get_task_id(task_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; -} -#endif +/* --- 7 Thread Inquiry ----------------------------------------------------- */ -/* --- 5 Process and Thread Settings ---------------------------------------- 
*/ +/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ) { +ompd_get_thread_handle(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle) { if (!addr_handle) return ompd_rc_stale_handle; ompd_address_space_context_t *context = addr_handle->context; @@ -714,237 +943,91 @@ ompd_get_num_procs(ompd_address_space_handle_t return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); + ompd_thread_context_t *tcontext; + ret = callbacks->get_thread_context_for_thread_id( + context, kind, sizeof_thread_id, thread_id, &tcontext); + if (ret != ompd_rc_ok) + return ret; - int nth; - ret = TValue(context, "__kmp_avail_proc") - .castBase("__kmp_avail_proc") - .getValue(nth); - *val = nth; - return ret; -} - -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; + int tId; - if (!context) - return ompd_rc_stale_handle; + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { + ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - assert(callbacks && "Callback table not initialized!"); + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x] + TValue th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + 
.getPtrArrayElement(p->threadIdx.x) + .dereference(); + + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + ret = th.getAddress(&taddr); - int nth; - ret = - TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); - *val = nth; - return ret; -} + if (ret != ompd_rc_ok) { + if (taddr.address == 0 && p->threadIdx.x % 32 == 0) { + // check for the master task/thread instead + // The master thread should never have the threadIdx.x of zero, so + // checking it this way should be safe -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ + th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("teamContext") + .cast("omptarget_nvptx_TeamDescr", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("levelZeroTaskDescr"); -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ret = th.getAddress(&taddr); - assert(callbacks && "Callback table not initialized!"); + if (ret != ompd_rc_ok) + return ret; + } else { + return ret; + } + } - ompd_rc_t ret = ompd_rc_ok; - if (parallel_handle->lwt.address != 0) - *val = 1; - else { - uint32_t res; - ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_nproc") /*t.t_nproc*/ - .castBase() - .getValue(res); - *val = res; - } - return ret; -} + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x] + // ->ompd_thread_info.threadIdx_x + ret = th.cast("omptarget_nvptx_TaskDescr", 0, 
OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("threadIdx_x") + .castBase(ompd_type_short) + .getValue(tId); -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + if (ret != ompd_rc_ok) + return ret; - assert(callbacks && "Callback table not initialized!"); + if (tId != p->threadIdx.x) { + return ompd_rc_stale_handle; + } - uint32_t res; + // allocate both the thread handle and the cuda kernel info in one go + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t) + + sizeof(ompd_cuda_thread_kernel_info_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_level") /*t.t_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} - -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - uint32_t res; - - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_active_level") /*t.t_active_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} - -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - -ompd_rc_t 
ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - - TValue teamInfo; - if (parallel_handle->lwt.address != 0) - teamInfo = TValue(context, parallel_handle->lwt) - .cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - teamInfo = - TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ - ompd_rc_t ret = teamInfo - .access("ompt_team_info") /*t.ompt_team_info*/ - .cast("ompt_team_info_t", 0) - .access("parallel_data") /*t.ompt_team_info.parallel_id*/ - .getAddress(data); - return ret; -} - -#if 0 // there is no such thing as a parallel function -ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ) -{ - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - parallel_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - - TValue teamInfo; - if(parallel_handle->lwt.address!=0) - teamInfo = TValue(context, parallel_handle->lwt). - cast("ompt_lw_taskteam_t",0); /*lwt*/ - else - teamInfo = TValue(context, parallel_handle->th). - cast("kmp_base_team_t",0); /*t*/ - ompd_rc_t ret = teamInfo. - access("ompt_team_info"). /*t.ompt_team_info*/ - cast("ompt_team_info_t",0). 
- access("microtask"). /*t.ompt_team_info.microtask*/ - castBase(). - getValue(parallel_addr->address); - return ret; -} -#endif // no parallel function - -/* --- 7 Thread Inquiry ----------------------------------------------------- */ - -/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - -ompd_rc_t -ompd_get_thread_handle(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; - - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - ompd_thread_context_t *tcontext; - ret = callbacks->get_thread_context_for_thread_id( - context, kind, sizeof_thread_id, thread_id, &tcontext); - if (ret != ompd_rc_ok) - return ret; - - int tId; - - if (kind == ompd_thread_id_cudalogical) { - ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - - // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->data.items.threadId - - ret = - TValue(context, tcontext, "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) - .cast("omptarget_nvptx_ThreadPrivateContext", 1, - OMPD_SEGMENT_CUDA_PTX_SHARED) - .access("topTaskDescr") - .cast("omptarget_nvptx_TaskDescr", 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .getArrayElement(p->threadIdx.x) - .access("data__items__threadId") - .castBase(ompd_type_short) - .getValue(tId); - - if (ret != ompd_rc_ok) - return ret; - - if (tId != p->threadIdx.x) - return ompd_rc_stale_handle; + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; + (*thread_handle)->cuda_kernel_info = + (ompd_cuda_thread_kernel_info_t*)((*thread_handle) + 1); + + (*thread_handle)->cuda_kernel_info->cudaDevId = p->cudaDevId; + 
(*thread_handle)->cuda_kernel_info->cudaContext = p->cudaContext; + (*thread_handle)->cuda_kernel_info->warpSize = p->warpSize; + (*thread_handle)->cuda_kernel_info->gridId = p->gridId; + (*thread_handle)->cuda_kernel_info->gridDim = p->gridDim; + (*thread_handle)->cuda_kernel_info->blockDim = p->blockDim; } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") @@ -960,16 +1043,17 @@ ompd_get_thread_handle(ompd_address_space_handle_t .getArrayElement(tId) /*__kmp_threads[t]*/ .access("th"); /*__kmp_threads[t]->th*/ - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ret = th.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; (*thread_handle)->ah = addr_handle; (*thread_handle)->th = taddr; + (*thread_handle)->cuda_kernel_info = NULL; #ifndef NDEBUG if (ret != ompd_rc_ok) @@ -990,13 +1074,14 @@ ompd_get_thread_handle(ompd_address_space_handle_t "Callback table not initialized!"); #endif } + (*thread_handle)->thread_context = tcontext; return ret; } ompd_rc_t ompd_get_thread_id( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { - if (kind != ompd_thread_id_pthread) + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { + if (kind != OMPD_THREAD_ID_PTHREAD && kind != OMPD_THREAD_ID_CUDALOGICAL) return ompd_rc_bad_input; if (!thread_handle) return ompd_rc_stale_handle; @@ -1005,52 +1090,68 @@ ompd_rc_t ompd_get_thread_id( ompd_address_space_context_t *context = thread_handle->ah->context; if (!context) return ompd_rc_stale_handle; - ompd_size_t size; - ompd_rc_t ret = tf.getType(context, "kmp_thread_t").getSize(&size); - if (ret != ompd_rc_ok) - return ret; - if (sizeof_thread_id != size) - return ompd_rc_bad_input; + 
ompd_rc_t ret; - assert(callbacks && "Callback table not initialized!"); + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { + if (sizeof_thread_id != sizeof(ompd_cudathread_coord_t)) { + return ompd_rc_bad_input; + } + ompd_cudathread_coord_t *cuda_thread_id = + (ompd_cudathread_coord_t*)thread_id; + cuda_thread_id->cudaDevId = thread_handle->cuda_kernel_info->cudaDevId; + cuda_thread_id->cudaContext = thread_handle->cuda_kernel_info->cudaContext; + cuda_thread_id->warpSize = thread_handle->cuda_kernel_info->warpSize; + cuda_thread_id->gridId = thread_handle->cuda_kernel_info->gridId; + + auto threadInfo = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + ret = threadInfo.access("threadIdx_x") + .castBase() + .getValue(cuda_thread_id->threadIdx.x); - ret = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_info") /*__kmp_threads[t]->th.th_info*/ - .cast("kmp_desc_t") - .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ - .cast("kmp_desc_base_t") - .access("ds_thread") /*__kmp_threads[t]->th.th_info.ds.ds_thread*/ - .cast("kmp_thread_t") - .getRawValue(thread_id, 1); - return ret; -} + if (ret != ompd_rc_ok) + return ret; -ompd_rc_t ompd_get_thread_num( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ) { - // __kmp_threads[8]->th.th_info.ds.ds_tid - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + cuda_thread_id->threadIdx.y = cuda_thread_id->threadIdx.z = 0; - assert(callbacks && "Callback table not initialized!"); + ret = threadInfo.access("blockIdx_x") + .castBase() + 
.getValue(cuda_thread_id->blockIdx.x); - ompd_rc_t ret = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_info") /*__kmp_threads[t]->th.th_info*/ - .cast("kmp_desc_t") - .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ - .cast("kmp_desc_base_t") - .access("ds_tid") /*__kmp_threads[t]->th.th_info.ds.ds_tid*/ - .castBase() - .getValue(*val); + if (ret != ompd_rc_ok) + return ret; + + cuda_thread_id->blockIdx.y = cuda_thread_id->blockIdx.z = 0; + + cuda_thread_id->gridDim = thread_handle->cuda_kernel_info->gridDim; + cuda_thread_id->blockDim = thread_handle->cuda_kernel_info->blockDim; + + return ompd_rc_ok; + } else { + ompd_size_t size; + ret = tf.getType(context, "kmp_thread_t").getSize(&size); + if (ret != ompd_rc_ok) + return ret; + if (sizeof_thread_id != size) + return ompd_rc_bad_input; + + assert(callbacks && "Callback table not initialized!"); + + ret = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_info") /*__kmp_threads[t]->th.th_info*/ + .cast("kmp_desc_t") + .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ + .cast("kmp_desc_base_t") + .access("ds_thread") /*__kmp_threads[t]->th.th_info.ds.ds_thread*/ + .cast("kmp_thread_t") + .getRawValue(thread_id, 1); + } return ret; } @@ -1071,26 +1172,38 @@ ompd_rc_t ompd_get_state( if (!ompd_state) return ompd_rc_needs_state_tracking; + ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); - TValue ompt_thread_info = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("ompt_thread_info") /*__kmp_threads[t]->th.ompt_thread_info*/ - .cast("ompt_thread_info_t"); - if (ompt_thread_info.gotError()) - return ompt_thread_info.getError(); - ompd_rc_t ret = - ompt_thread_info - .access("state") /*__kmp_threads[t]->th.ompt_thread_info.state*/ - .castBase() - .getValue(*state); - if (ret != ompd_rc_ok) - return ret; - ret = ompt_thread_info - 
.access("wait_id") /*__kmp_threads[t]->th.ompt_thread_info.state*/ + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + if (wait_id) + *wait_id = 0; + ret = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("state") + .castBase(ompd_type_long_long) + .getValue(*state); + } else { + TValue ompt_thread_info = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("ompt_thread_info") /*__kmp_threads[t]->th.ompt_thread_info*/ + .cast("ompt_thread_info_t"); + if (ompt_thread_info.gotError()) + return ompt_thread_info.getError(); + ret = ompt_thread_info + .access("state") /*__kmp_threads[t]->th.ompt_thread_info.state*/ .castBase() - .getValue(*wait_id); + .getValue(*state); + if (ret != ompd_rc_ok) + return ret; + ret = ompt_thread_info + .access("wait_id") /*__kmp_threads[t]->th.ompt_thread_info.state*/ + .castBase() + .getValue(*wait_id); + } return ret; } @@ -1098,244 +1211,6 @@ ompd_rc_t ompd_get_state( /* --- 8.1 Task Settings ---------------------------------------------------- */ -ompd_rc_t ompd_get_max_threads( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("nproc") // td->td_icvs.dynamic - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t ompd_in_parallel( // Why do we need a task context for _in_parallel? 
- ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - ompd_rc_t ret; - - assert(callbacks && "Callback table not initialized!"); - - ret = TValue(context, "__kmp_root") // __kmp_root - .cast("kmp_root_t", 2) - .dereference() // (*__kmp_root) - .access("r") // (*__kmp_root)->r - .cast("kmp_base_root_t") - .access("r_in_parallel") // (*__kmp_root)->r.r_in_parallel - .castBase() - .getValue(*val); - if (ret != ompd_rc_ok) - return ret; - if (*val) - *val = 1; - - return ret; -} - -ompd_rc_t -ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_flags") // td->td_icvs - .cast("kmp_tasking_flags_t") - .check("final", val); // td->td_icvs.max_active_levels - - return ret; -} - -ompd_rc_t -ompd_get_dynamic(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - 
.cast("kmp_internal_control_t", 0) - .access("dynamic") // td->td_icvs.dynamic - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t -ompd_get_nested(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("nested") // td->td_icvs.nested - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t ompd_get_max_active_levels( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("max_active_levels") // td->td_icvs.max_active_levels - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t -ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback 
table not initialized!"); - - TValue sched = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("sched") // td->td_icvs.sched - .cast("kmp_r_sched_t", 0); - - ompd_rc_t ret = sched - .access("r_sched_type") // td->td_icvs.sched.r_sched_type - .castBase() - .getValue(*kind); - if (ret != ompd_rc_ok) - return ret; - ret = sched - .access("chunk") // td->td_icvs.sched.r_sched_type - .castBase() - .getValue(*modifier); - return ret; -} - -ompd_rc_t -ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *bind /* OUT: Kind of proc-binding */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("proc_bind") // td->td_icvs.proc_bind - .castBase() - .getValue(*bind); - - return ret; -} - -ompd_rc_t -ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_flags") // td->td_flags - .cast("kmp_tasking_flags_t") - .check("tasktype", val); // td->td_flags.tasktype - *val ^= 1; // tasktype: explicit = 1, implicit = 0 => invert the value - return ret; -} - /* --- 8.2 OMPT Task Inquiry Analogues 
-------------------------------------- */ ompd_rc_t ompd_get_task_frame( @@ -1365,7 +1240,7 @@ ompd_rc_t ompd_get_task_frame( .access("ompt_task_info") // td->ompt_task_info .cast("ompt_task_info_t") .access("frame") // td->ompd_task_info.frame - .cast("ompt_frame_t", 0); + .cast("omp_frame_t", 0); sp_reentry->segment = OMPD_SEGMENT_UNSPECIFIED; ompd_rc_t ret = frame @@ -1385,38 +1260,6 @@ ompd_rc_t ompd_get_task_frame( return ret; } -ompd_rc_t -ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *task_data /* OUT: OpenMP task ID */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - - TValue taskInfo; - if (task_handle->lwt.address != 0) - taskInfo = - TValue(context, task_handle->lwt).cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - taskInfo = TValue(context, task_handle->th).cast("kmp_taskdata_t", 0); /*t*/ - ompd_rc_t ret = taskInfo - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("task_data") // td->ompt_task_info.task_data - .getAddress(task_data); - - return ret; -} - -#if 1 // the runtime currently does not have task function information ompd_rc_t ompd_get_task_function( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_address_t *task_addr /* OUT: first instruction in the task region */ @@ -1433,35 +1276,66 @@ ompd_rc_t ompd_get_task_function( return ompd_rc_needs_state_tracking; assert(callbacks && "Callback table not initialized!"); - -#if 0 - /* We don't have a task function for implicit tasks */ - ompd_word_t implicit; - ompd_rc_t ret = ompd_is_implicit (task_handle, &implicit); - if (ret != ompd_rc_ok) - return ret; - if (implicit) - return ompd_rc_bad_input; 
-#else ompd_rc_t ret; -#endif - task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - TValue taskInfo; - if(task_handle->lwt.address!=0) - return ompd_rc_bad_input; // We need to decide what we do here. - else - ret = TValue(context, task_handle->th). - cast("kmp_taskdata_t",0). /*t*/ - getArrayElement(1). /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ - cast("kmp_task_t",0). /* (kmp_task_t *) */ - access("routine"). /*td->ompt_task_info*/ - castBase(). - getValue(task_addr->address); + + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + ret = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_function") + .castBase() + .getValue(task_addr->address); + + } else { + task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + TValue taskInfo; + if(task_handle->lwt.address!=0) + return ompd_rc_bad_input; // We need to decide what we do here. 
+ else + { + ompd_word_t val; + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_flags + .cast("kmp_tasking_flags_t") + .check("tasktype", &val); // td->td_flags.tasktype + + if (ret != ompd_rc_ok) + return ret; + + if (val==1) { // tasktype: explicit = 1, implicit = 0 + + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t",0) /*t*/ + .getArrayElement(1) /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ + .cast("kmp_task_t",0) /* (kmp_task_t *) */ + .access("routine") /*td->ompt_task_info*/ + .castBase() + .getValue(task_addr->address); + + } else { + + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("t_pkfn") /*td.td_team->t.t_pkfn*/ + .castBase() + .getValue(task_addr->address); + + } + } + } return ret; } -#endif -/* --- 9 OMPD Version and Compatibility Information ------------------------- */ +/* --- --- OMPD Version and Compatibility Information ----------------------- */ ompd_rc_t ompd_get_api_version(ompd_word_t *version) { *version = OMPD_VERSION; @@ -1479,7 +1353,7 @@ ompd_get_api_version_string(const char **string /* OUT: OMPD version string */ return ompd_rc_ok; } -/* --- 12 Display Control Variables ----------------------------------------- */ +/* --- 4.8 Display Control Variables ---------------------------------------- */ ompd_rc_t ompd_get_display_control_vars(ompd_address_space_handle_t *handle, @@ -1501,12 +1375,12 @@ ompd_rc_t initTypeSizes(ompd_address_space_context_t *context) { static ompd_rc_t ret; if (inited) return ret; - ret = callbacks->tsizeof_prim(context, &type_sizes); + ret = callbacks->sizeof_types(context, &type_sizes); if (ret != ompd_rc_ok) return ret; if (!(type_sizes.sizeof_pointer > 0)) return ompd_rc_error; - ret = callbacks->tsizeof_prim(context, &TValue::type_sizes); + 
ret = callbacks->sizeof_types(context, &TValue::type_sizes); if (ret != ompd_rc_ok) return ret; inited = 1; diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index a4cd8f785..81b652dab 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -13,17 +13,16 @@ #ifdef __cplusplus #include -#include #define OMPD_DLL_VERSION 201811; extern "C" { #endif -#define OMPD_IMPLEMENTS_OPENMP 3 -#define OMPD_IMPLEMENTS_OPENMP_SUBVERSION 1 +#define OMPD_IMPLEMENTS_OPENMP 5 +#define OMPD_IMPLEMENTS_OPENMP_SUBVERSION 0 #define OMPD_TR_VERSION 6 -#define OMPD_TR_SUBVERSION 'j' +#define OMPD_TR_SUBVERSION 2 #define OMPD_VERSION \ (OMPD_IMPLEMENTS_OPENMP << 24) + (OMPD_IMPLEMENTS_OPENMP_SUBVERSION << 16) + \ (OMPD_TR_VERSION << 8) + OMPD_TR_SUBVERSION @@ -32,79 +31,78 @@ extern "C" { #define STR(x) STR_HELPER(x) #include "ompd.h" +#include "ompd-types.h" /****************************************************************************** * General helper functions - */ -ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); + */ + ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); #ifdef __cplusplus -} - -static const ompd_callbacks_t *callbacks = NULL; - -class ompdAllocatable { -public: - static void *operator new(std::size_t sz) { - void *res; - ompd_rc_t ret = callbacks->dmemory_alloc(sz, &res); - if (ret == ompd_rc_ok) - return res; - throw std::bad_alloc(); - } - static void *operator new[](std::size_t sz) { - void *res; - ompd_rc_t ret = callbacks->dmemory_alloc(sz, &res); - if (ret == ompd_rc_ok) - return res; - throw std::bad_alloc(); - } - void operator delete(void *addr) throw() { - ompd_rc_t ret = callbacks->dmemory_free(addr); - if (ret != ompd_rc_ok) - throw std::bad_alloc(); - } - void operator delete[](void *addr) throw() { - ompd_rc_t ret = callbacks->dmemory_free(addr); - if (ret != ompd_rc_ok) - throw std::bad_alloc(); } -}; -typedef struct _ompd_address_space_context_s ompd_address_space_context_t; -typedef struct 
_ompd_process_handle_s : public ompdAllocatable { - ompd_address_space_context_t *context; -} ompd_process_handle_t; +static const ompd_callbacks_t *callbacks = nullptr; + + +// Information shared by all threads in a kernel +// Used to map thread handles to native cuda thread ids +typedef struct _ompd_cuda_thread_kernel_info_s { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_dim3_t gridDim; + ompd_dim3_t blockDim; +} ompd_cuda_thread_kernel_info_t; -typedef struct _ompd_address_space_handle_s : public ompdAllocatable { +typedef struct _ompd_address_space_context_s ompd_address_space_context_t; + +typedef struct _ompd_address_space_handle_s { ompd_address_space_context_t *context; - ompd_device_kind_t kind; - ompd_device_identifier_t id; + ompd_device_t kind; + uint64_t id; } ompd_address_space_handle_t; -typedef struct _ompd_device_handle_s : public ompdAllocatable { - ompd_address_space_handle_t *ah; - ompd_address_t th; /* target handle */ -} ompd_device_handle_t; - -typedef struct _ompd_thread_handle_s : public ompdAllocatable { +typedef struct _ompd_thread_handle_s { ompd_address_space_handle_t *ah; + ompd_thread_context_t *thread_context; ompd_address_t th; /* target handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* only valid for cuda */ } ompd_thread_handle_t; -typedef struct _ompd_parallel_handle_s : public ompdAllocatable { +typedef struct _ompd_parallel_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* copied from the thread + used to retrieve this + parallel region handle + */ } ompd_parallel_handle_t; -typedef struct _ompd_task_handle_s : public ompdAllocatable { +typedef struct _ompd_task_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; 
/* copied from the thread + used to retrieve this + parallel region handle + */ + _ompd_task_handle_s(){ + ah=NULL; + th.segment=OMPD_SEGMENT_UNSPECIFIED; + lwt.segment=OMPD_SEGMENT_UNSPECIFIED; + th.address=0; + lwt.address=0; + cuda_kernel_info=NULL; + } } ompd_task_handle_t; #endif +// TODO (mr) this is ugly, but better then a global symbol (?) +void __ompd_init_icvs(const ompd_callbacks_t *table); +void __ompd_init_states(const ompd_callbacks_t *table); + #endif /* SRC_OMP_DEBUG_H_ */ diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp new file mode 100644 index 000000000..72598ad09 --- /dev/null +++ b/libompd/src/omp-icv.cpp @@ -0,0 +1,512 @@ +#include "omp-debug.h" +#include "ompd-private.h" +#include "TargetValue.h" + +#define FOREACH_OMPD_ICV(macro) \ + macro (levels_var, "levels-var", ompd_scope_parallel, 1) \ + macro (active_levels_var, "active-levels-var", ompd_scope_parallel, 0) \ + macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space, 0) \ + macro (max_active_levels_var, "max-active-levels-var", ompd_scope_task, 0) \ + macro (bind_var, "bind-var", ompd_scope_task, 0) \ + macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space, 0) \ + macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread, 1) \ + macro (final_var, "ompd-final-var", ompd_scope_task, 0) \ + macro (implicit_var, "ompd-implicit-var", ompd_scope_task, 0) \ + macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel, 1) \ + +void __ompd_init_icvs(const ompd_callbacks_t *table) { + callbacks = table; +} + +enum ompd_icv { + ompd_icv_undefined_marker = 0, // ompd_icv_undefined is already defined in ompd.h +#define ompd_icv_macro(v, n, s, d) ompd_icv_ ## v, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro + ompd_icv_after_last_icv +}; + +static const char *ompd_icv_string_values[] = { + "undefined", +#define ompd_icv_macro(v, n, s, d) n, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro +}; + +static const ompd_scope_t 
ompd_icv_scope_values[] = { + ompd_scope_global, // undefined marker +#define ompd_icv_macro(v, n, s, d) s, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro +}; + +static const uint8_t ompd_icv_available_cuda[] = { + 1, // undefined marker +#define ompd_icv_macro(v, n, s, d) d, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro + 1, // icv after last icv marker +}; + + +static ompd_rc_t ompd_enumerate_icvs_cuda(ompd_icv_id_t current, + ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more) { + int next_possible_icv = current; + do { + next_possible_icv++; + } while (!ompd_icv_available_cuda[next_possible_icv]); + + if (next_possible_icv >= ompd_icv_after_last_icv) { + return ompd_rc_bad_input; + } + + *next_id = next_possible_icv; + *next_icv_name = ompd_icv_string_values[*next_id]; + *next_scope = ompd_icv_scope_values[*next_id]; + + do { + next_possible_icv++; + } while (!ompd_icv_available_cuda[next_possible_icv]); + + if (next_possible_icv >= ompd_icv_after_last_icv) { + *more = 0; + } else { + *more = 1; + } + return ompd_rc_ok; +} + +ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, + ompd_icv_id_t current, ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more) { + if (!handle) { + return ompd_rc_stale_handle; + } + if (handle->kind == OMPD_DEVICE_KIND_CUDA) { + return ompd_enumerate_icvs_cuda(current, next_id, next_icv_name, + next_scope, more); + } + if (current + 1 >= ompd_icv_after_last_icv) { + return ompd_rc_bad_input; + } + + *next_id = current + 1; + *next_icv_name = ompd_icv_string_values[*next_id]; + *next_scope = ompd_icv_scope_values[*next_id]; + + if ((*next_id) + 1 >= ompd_icv_after_last_icv) { + *more = 0; + } else { + *more = 1; + } + + return ompd_rc_ok; +} + + +static ompd_rc_t ompd_get_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: nesting level */ + ) { + if 
(!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + uint32_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_level") /*t.t_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} + + +static ompd_rc_t ompd_get_level_cuda( + ompd_parallel_handle_t *parallel_handle, + ompd_word_t *val) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized"); + + uint16_t res; + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("level") + .castBase(ompd_type_short) + .getValue(res); + *val = res; + return ret; +} + + +static ompd_rc_t ompd_get_active_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: active nesting level */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + uint32_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_active_level") /*t.t_active_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} + + +static ompd_rc_t +ompd_get_num_procs(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: number of processes */ + ) { + ompd_address_space_context_t *context = addr_handle->context; + if (!context) + return ompd_rc_stale_handle; + ompd_rc_t ret; + + if (!context) + 
return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + int nth; + ret = TValue(context, "__kmp_avail_proc") + .castBase("__kmp_avail_proc") + .getValue(nth); + *val = nth; + return ret; +} + +static ompd_rc_t +ompd_get_thread_limit(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; + + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + int nth; + ret = + TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); + *val = nth; + return ret; +} + +static ompd_rc_t ompd_get_thread_num( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *val /* OUT: number of the thread within the team */ + ) { + // __kmp_threads[8]->th.th_info.ds.ds_tid + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_info") /*__kmp_threads[t]->th.th_info*/ + .cast("kmp_desc_t") + .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ + .cast("kmp_desc_base_t") + .access("ds_tid") /*__kmp_threads[t]->th.th_info.ds.ds_tid*/ + .castBase() + .getValue(*val); + return ret; +} + +static ompd_rc_t +ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + 
assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_icvs + .cast("kmp_tasking_flags_t") + .check("final", val); // td->td_icvs.max_active_levels + + return ret; +} + +static ompd_rc_t +ompd_get_max_active_levels( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("max_active_levels") // td->td_icvs.max_active_levels + .castBase() + .getValue(*val); + + return ret; +} + +static ompd_rc_t +ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ + ompd_word_t *modifier /* OUT: Schedunling modifier */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + TValue sched = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("sched") // td->td_icvs.sched + .cast("kmp_r_sched_t", 0); + + ompd_rc_t ret = sched + .access("r_sched_type") // td->td_icvs.sched.r_sched_type + .castBase() + .getValue(*kind); + if (ret != ompd_rc_ok) + return ret; + ret = sched + .access("chunk") // td->td_icvs.sched.r_sched_type + .castBase() + .getValue(*modifier); + return ret; +} + +static ompd_rc_t +ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: 
OpenMP task handle*/ + ompd_word_t *bind /* OUT: Kind of proc-binding */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("proc_bind") // td->td_icvs.proc_bind + .castBase() + .getValue(*bind); + + return ret; +} + + +static ompd_rc_t +ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_flags + .cast("kmp_tasking_flags_t") + .check("tasktype", val); // td->td_flags.tasktype + *val ^= 1; // tasktype: explicit = 1, implicit = 0 => invert the value + return ret; +} + +static ompd_rc_t +ompd_get_num_threads(ompd_parallel_handle_t + *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: number of threads */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = ompd_rc_ok; + if (parallel_handle->lwt.address != 0) { + *val = 1; + } else { + uint32_t res; + ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_nproc") /*t.t_nproc*/ + .castBase() + .getValue(res); + *val = res; + } + 
return ret; +} + +static ompd_rc_t +ompd_get_num_threads_cuda(ompd_parallel_handle_t *parallel_handle, + ompd_word_t *val) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized"); + + uint16_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("items__threadsInTeam") + .castBase() + .getValue(res); + *val = res; + return ret; +} + +ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + ompd_word_t *icv_value) { + if (!handle) { + return ompd_rc_stale_handle; + } + if (icv_id >= ompd_icv_after_last_icv || icv_id == 0) { + return ompd_rc_bad_input; + } + if (scope != ompd_icv_scope_values[icv_id]) { + return ompd_rc_bad_input; + } + + ompd_device_t device_kind; + + switch (scope) { + case ompd_scope_thread: + device_kind = ((ompd_thread_handle_t *)handle)->ah->kind; + break; + case ompd_scope_parallel: + device_kind = ((ompd_parallel_handle_t *)handle)->ah->kind; + break; + case ompd_scope_address_space: + device_kind = ((ompd_address_space_handle_t *)handle)->kind; + break; + case ompd_scope_task: + device_kind = ((ompd_task_handle_t *)handle)->ah->kind; + break; + default: + return ompd_rc_bad_input; + } + + + if (device_kind == OMPD_DEVICE_KIND_HOST) { + switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_active_levels_var: + return ompd_get_active_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_thread_limit_var: + return ompd_get_thread_limit((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_max_active_levels_var: + return 
ompd_get_max_active_levels((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_bind_var: + return ompd_get_proc_bind((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_num_procs_var: + return ompd_get_num_procs((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_thread_num_var: + return ompd_get_thread_num((ompd_thread_handle_t*)handle, icv_value); + case ompd_icv_final_var: + return ompd_in_final((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_implicit_var: + return ompd_is_implicit((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads((ompd_parallel_handle_t*)handle, icv_value); + default: + return ompd_rc_unsupported; + } + } else if (device_kind == OMPD_DEVICE_KIND_CUDA) { + switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level_cuda((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads_cuda((ompd_parallel_handle_t*)handle, icv_value); + default: + return ompd_rc_unsupported; + } + } + return ompd_rc_unsupported; +} + +ompd_rc_t +ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + const char **icv_string) { + return ompd_rc_unsupported; +} diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp new file mode 100644 index 000000000..0e64aad95 --- /dev/null +++ b/libompd/src/omp-state.cpp @@ -0,0 +1,97 @@ +#include "ompd.h" +#include "ompd-private.h" +#include "omp-debug.h" +#include + +void __ompd_init_states(const ompd_callbacks_t *table) { + callbacks = table; +} + +static const char *get_ompd_state_name(ompd_word_t state) { + switch (state) { +#define ompd_state_macro(state, code) \ + case code: return #state ; + FOREACH_OMP_STATE(ompd_state_macro) +#undef ompd_state_macro + default: return NULL; + } +} + +static const char *get_ompd_cuda_state_name(ompd_word_t state) { + switch (state) { + case omp_state_work_serial: + return "omp_state_work_serial"; + case 
omp_state_work_parallel: + return "omp_state_work_parallel"; + case omp_state_work_reduction: + return "omp_state_work_reduction"; + default: + return NULL; + } +} + +ompd_rc_t ompd_enumerate_states( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, ompd_word_t *next_state, + const char **next_state_name, ompd_word_t *more_enums) { + ompd_rc_t ret; + if (address_space_handle->kind == OMPD_DEVICE_KIND_CUDA) { + // We only support a small number of states for cuda devices + *more_enums = 1; + switch (current_state) { + case omp_state_undefined: + *next_state = omp_state_work_serial; + break; + case omp_state_work_serial: + *next_state = omp_state_work_parallel; + break; + case omp_state_work_parallel: + *next_state = omp_state_work_reduction; + *more_enums = 0; + break; + default: + return ompd_rc_bad_input; + } + const char *find_next_state_name = get_ompd_cuda_state_name(*next_state); + char *next_state_name_cpy; + ret = callbacks->memory_alloc( + strlen(find_next_state_name) + 1, (void **)&next_state_name_cpy); + if (ret != ompd_rc_ok) { + return ret; + } + strcpy(next_state_name_cpy, get_ompd_cuda_state_name(*next_state)); + *next_state_name = next_state_name_cpy; + } else { + if (current_state > omp_state_undefined && + current_state >= OMPD_LAST_OMP_STATE) { + return ompd_rc_bad_input; + } + if (current_state == omp_state_undefined) { + (*next_state) = omp_state_work_serial; + (*next_state_name) = get_ompd_state_name(omp_state_work_serial); + (*more_enums) = 1; + return ompd_rc_ok; + } + const char *find_next_state_name; + *next_state = current_state + 1; + while (!(find_next_state_name = get_ompd_state_name(*next_state))) { + ++(*next_state); + } + + char *next_state_name_cpy; + ret = callbacks->memory_alloc(strlen(find_next_state_name) + 1, (void **)&next_state_name_cpy); + if (ret != ompd_rc_ok) { + return ret; + } + strcpy(next_state_name_cpy, find_next_state_name); + + *next_state_name = next_state_name_cpy; + + if 
(*next_state == OMPD_LAST_OMP_STATE) { + *more_enums = 0; + } else { + *more_enums = 1; + } + } + return ompd_rc_ok; +} diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h new file mode 100644 index 000000000..bc5a04794 --- /dev/null +++ b/libompd/src/ompd-private.h @@ -0,0 +1,74 @@ +#ifndef SRC_OMPD_PRIVATE_H_ +#define SRC_OMPD_PRIVATE_H_ + + +/* + * Definition of OMPD states, taken from OMPT + */ +#define FOREACH_OMP_STATE(macro) \ + \ + /* first available state */ \ + macro (omp_state_undefined, 0x102) /* undefined thread state */ \ + \ + /* work states (0..15) */ \ + macro (omp_state_work_serial, 0x000) /* working outside parallel */ \ + macro (omp_state_work_parallel, 0x001) /* working within parallel */ \ + macro (omp_state_work_reduction, 0x002) /* performing a reduction */ \ + \ + /* barrier wait states (16..31) */ \ + macro (omp_state_wait_barrier, 0x010) /* waiting at a barrier */ \ + macro (omp_state_wait_barrier_implicit_parallel, 0x011) \ + /* implicit barrier at the end of parallel region */\ + macro (omp_state_wait_barrier_implicit_workshare, 0x012) \ + /* implicit barrier at the end of worksharing */ \ + macro (omp_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \ + macro (omp_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \ + \ + /* task wait states (32..63) */ \ + macro (omp_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \ + macro (omp_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \ + \ + /* mutex wait states (64..127) */ \ + macro (omp_state_wait_mutex, 0x040) \ + macro (omp_state_wait_lock, 0x041) /* waiting for lock */ \ + macro (omp_state_wait_critical, 0x042) /* waiting for critical */ \ + macro (omp_state_wait_atomic, 0x043) /* waiting for atomic */ \ + macro (omp_state_wait_ordered, 0x044) /* waiting for ordered */ \ + \ + /* target wait states (128..255) */ \ + macro (omp_state_wait_target, 0x080) /* waiting for target region */ \ + macro (omp_state_wait_target_map, 0x081) /* 
waiting for target data mapping operation */ \ + macro (omp_state_wait_target_update, 0x082) /* waiting for target update operation */ \ + \ + /* misc (256..511) */ \ + macro (omp_state_idle, 0x100) /* waiting for work */ \ + macro (omp_state_overhead, 0x101) /* overhead excluding wait states */ \ + \ + /* implementation-specific states (512..) */ + +typedef enum omp_state_t { +#define ompd_state_macro(state, code) state = code, + FOREACH_OMP_STATE(ompd_state_macro) +#undef ompd_state_macro +} omp_state_t; + +#define OMPD_LAST_OMP_STATE omp_state_overhead + + +/** + * Primitive types. + */ +typedef enum ompd_target_prim_types_t { + ompd_type_invalid = -1, + ompd_type_char = 0, + ompd_type_short = 1, + ompd_type_int = 2, + ompd_type_long = 3, + ompd_type_long_long = 4, + ompd_type_pointer = 5, + ompd_type_max +} ompd_target_prim_types_t; + +#include "ompd-types.h" + +#endif /*SRC_OMPD_PRIVATE_H*/ diff --git a/libompd/src/ompd-types.h b/libompd/src/ompd-types.h new file mode 100644 index 000000000..ea5aedef4 --- /dev/null +++ b/libompd/src/ompd-types.h @@ -0,0 +1,65 @@ +/* +* @@name: ompd_types.h +*/ +#ifndef __OPMD_TYPES_H +#define __OPMD_TYPES_H +#include "ompd.h" + +#define OMPD_TYPES_VERSION 20170927 /* YYYYMMDD Format */ + +/* Kinds of device threads */ +#define OMPD_THREAD_ID_PTHREAD ((ompd_thread_id_t)0) +#define OMPD_THREAD_ID_LWP ((ompd_thread_id_t)1) +#define OMPD_THREAD_ID_WINTHREAD ((ompd_thread_id_t)2) +#define OMPD_THREAD_ID_CUDALOGICAL ((ompd_thread_id_t)3) +/* The range of non-standard implementation defined values */ +#define OMPD_THREAD_ID_LO ((ompd_thread_id_t)1000000) +#define OMPD_THREAD_ID_HI ((ompd_thread_id_t)1100000) + +/* Target Cuda device-specific thread identification */ +typedef struct ompd_dim3_t { + ompd_addr_t x; + ompd_addr_t y; + ompd_addr_t z; +} ompd_dim3_t; + +typedef struct ompd_cudathread_coord_t { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_dim3_t gridDim; + 
ompd_dim3_t blockDim; + ompd_dim3_t blockIdx; + ompd_dim3_t threadIdx; +} ompd_cudathread_coord_t; + +/* Memory Access Segment definitions for Host and Target Devices */ +#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) + +/* Cuda-specific values consistent with those defined in cudadebugger.h */ +#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) +#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) +#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) +#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) +#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) +#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) +#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) +#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) +#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) +#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) +#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) +#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) +#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) +#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) +#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) + +/* Kinds of device device address spaces */ +#define OMPD_DEVICE_KIND_HOST ((ompd_device_t)1) +#define OMPD_DEVICE_KIND_CUDA ((ompd_device_t)2) +/* The range of non-standard implementation defined values */ +#define OMPD_DEVICE_IMPL_LO ((ompd_device_t)1000000) +#define OMPD_DEVICE_IMPL_HI ((ompd_device_t)1100000) +#endif + diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 2c97f09f4..48ae79e98 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -21,10 +21,6 @@ * - Type entities end with the suffix "_t" (for type) * - Function types end with the suffix "_fn_t" (for function type) * - Return code entities have "_rc_" in it - * - Abstractions referring to the target have the prefix "t" (e.g., - * "tmemory" for memory in the target, or "tsymbol" for symbol in the target) - * - Abstractions referring to the 
debugger have the prefix "d" (e.g., - * "dmemory" for memory in the debugger) * * Comment conventions: * - Input function parameters denoted by "IN:" @@ -60,102 +56,11 @@ typedef struct ompd_address_t { ompd_addr_t address; /* target address in the segment */ } ompd_address_t; -#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) -#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) +const uint64_t ompd_segment_none = 0; -/** - * The following definitions match with ptx information stored in DWARF - */ -#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) -#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) -#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) -#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) -#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) -#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) -#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) -#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) -#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) -#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) -#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) -#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) -#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) -#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) -#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) -#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) - -#if 0 // types removed in Austin F2F -/* - * Definition of OMPD states, taken from OMPT - */ -#define FOREACH_OMPD_STATE(macro) \ - \ - /* first */ \ - macro(ompd_state_first, 0x71) /* initial enumeration state */ \ - \ - /* work states (0..15) */ \ - macro(ompd_state_work_serial, 0x00) /* working outside parallel */ \ - macro(ompd_state_work_parallel, 0x01) /* working within parallel */ \ - macro(ompd_state_work_reduction, 0x02) /* performing a reduction */ \ - \ - /* idle (16..31) */ \ - macro(ompd_state_idle, 0x10) /* waiting for work 
*/ \ - \ - /* overhead states (32..63) */ \ - macro(ompd_state_overhead, 0x20) /* overhead excluding wait states */ \ - \ - /* barrier wait states (64..79) */ \ - macro(ompd_state_wait_barrier, 0x40) /* waiting at a barrier */ \ - macro(ompd_state_wait_barrier_implicit, 0x41) /* implicit barrier */ \ - macro(ompd_state_wait_barrier_explicit, 0x42) /* explicit barrier */ \ - \ - /* task wait states (80..95) */ \ - macro(ompd_state_wait_taskwait, 0x50) /* waiting at a taskwait */ \ - macro(ompd_state_wait_taskgroup, 0x51) /* waiting at a taskgroup */ \ - \ - /* mutex wait states (96..111) */ \ - macro(ompd_state_wait_lock, 0x60) /* waiting for lock */ \ - macro(ompd_state_wait_nest_lock, 0x61) /* waiting for nest lock */ \ - macro(ompd_state_wait_critical, 0x62) /* waiting for critical */ \ - macro(ompd_state_wait_atomic, 0x63) /* waiting for atomic */ \ - macro(ompd_state_wait_ordered, 0x64) /* waiting for ordered */ \ - macro(ompd_state_wait_single, \ - 0x6F) /* waiting for single region (non-standard!) 
*/ \ - \ - /* misc (112..127) */ \ - macro(ompd_state_undefined, 0x70) /* undefined thread state */ - -typedef enum ompd_state_t { -#define ompd_state_macro(state, code) state = code, - FOREACH_OMPD_STATE(ompd_state_macro) -#undef ompd_state_macro -} ompd_state_t; - -typedef enum ompd_sched_t { - ompd_sched_static = 1, - ompd_sched_dynamic = 2, - ompd_sched_guided = 3, - ompd_sched_auto = 4, - ompd_sched_vendor_lo = 5, - ompd_sched_vendor_hi = 0x7fffffff -} ompd_sched_t; - -typedef enum ompd_proc_bind_t { - ompd_proc_bind_false = 0, - ompd_proc_bind_true = 1, - ompd_proc_bind_master = 2, - ompd_proc_bind_close = 3, - ompd_proc_bind_spread = 4 -} ompd_proc_bind_t; -#endif - -typedef uint64_t ompd_device_identifier_t; - -typedef enum ompd_device_kind_t { - ompd_device_kind_host = 1, - ompd_device_kind_cuda = 2 -} ompd_device_kind_t; +/* types for device and thread id KIND, not for the actual thread/device id */ +typedef uint64_t ompd_device_t; +typedef uint64_t ompd_thread_id_t; /** * Context handle. @@ -185,42 +90,19 @@ typedef struct _ompd_task_handle_s ompd_task_handle_t; typedef struct _ompd_address_space_handle_s ompd_address_space_handle_t; /** - * Other handles. 
+ * Scope for ICVs */ -#define OMPD_THREAD_ID_PTHREAD 0 -#define OMPD_THREAD_ID_LWP 1 -#define OMPD_THREAD_ID_WINTHREAD 2 -#define OMPD_THREAD_ID_CUDALOGICAL 3 -#define OMPD_THREAD_ID_MAX 4 - -typedef enum ompd_thread_id_kind_t { - ompd_thread_id_pthread = 0, - ompd_thread_id_lwp = 1, - ompd_thread_id_winthread = 2, - ompd_thread_id_cudalogical = 3 -} ompd_thread_id_kind_t; +typedef enum ompd_scope_t { + ompd_scope_global = 1, + ompd_scope_address_space = 2, + ompd_scope_thread = 3, + ompd_scope_parallel = 4, + ompd_scope_implicit_task = 5, + ompd_scope_task = 6 +} ompd_scope_t; -/** - * Logical coordinates of OMP target device threads - */ -typedef struct ompd_dim3_t { - ompd_word_t x; - ompd_word_t y; - ompd_word_t z; -} ompd_dim3_t; - -typedef struct ompd_cudathread_coord_t { - ompd_addr_t cudaDevId; - ompd_addr_t cudaContext; - ompd_addr_t warpSize; - ompd_addr_t gridId; - ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work - // with grids too well. - ompd_dim3_t gridDim; - ompd_dim3_t blockDim; - ompd_dim3_t blockIdx; - ompd_dim3_t threadIdx; -} ompd_cudathread_coord_t; +typedef uint64_t ompd_icv_id_t; +const uint64_t ompd_icv_undefined = 0; /** * Return codes. @@ -240,33 +122,19 @@ typedef enum ompd_rc_t { ompd_rc_nomem = 10 /* unable to allocate memory */ } ompd_rc_t; -/** - * Primitive types. - */ -typedef enum ompd_target_prim_types_t { - ompd_type_invalid = -1, - ompd_type_char = 0, - ompd_type_short = 1, - ompd_type_int = 2, - ompd_type_long = 3, - ompd_type_long_long = 4, - ompd_type_pointer = 5, - ompd_type_max -} ompd_target_prim_types_t; - /** * Primitive type sizes. * These types are used by OMPD to interrogate the debugger about the size of * primitive types in the target. 
*/ -typedef struct ompd_target_type_sizes_t { +typedef struct ompd_device_type_sizes_t { uint8_t sizeof_char; uint8_t sizeof_short; uint8_t sizeof_int; uint8_t sizeof_long; uint8_t sizeof_long_long; uint8_t sizeof_pointer; -} ompd_target_type_sizes_t; +} ompd_device_type_sizes_t; /****************************************************************************** * Debugger callback signatures. @@ -280,7 +148,7 @@ typedef struct ompd_target_type_sizes_t { /** * Allocate memory in the debugger's address space. */ -typedef ompd_rc_t (*ompd_dmemory_alloc_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)( ompd_size_t bytes, /* IN: bytes of the primitive type */ void **ptr /* OUT: pointer of the allocated memory */ ); @@ -288,43 +156,31 @@ typedef ompd_rc_t (*ompd_dmemory_alloc_fn_t)( /** * Free memory in the debugger's address space. */ -typedef ompd_rc_t (*ompd_dmemory_free_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)( void *ptr /* IN: pointer of memory to deallocate */ ); /** * Get thread specific context. 
*/ -typedef ompd_rc_t (*ompd_get_thread_context_for_thread_id_fn_t)( - ompd_address_space_context_t *context, ompd_thread_id_kind_t kind, +typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)( + ompd_address_space_context_t *context, ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_context_t **thread_context); -#if 0 -/** - * Get containing (host) process context for address_space_context - */ -typedef ompd_rc_t (*ompd_get_process_context_for_context_fn_t) ( - ompd_address_space_context_t* - address_space_context, /* IN: OMP device/process addr space */ - ompd_address_space_context_t** - containing_address_space_context /* OUT: Containing omp process addr space */ -); -#endif - /** * Look up the sizes of primitive types in the target */ -typedef ompd_rc_t (*ompd_tsizeof_prim_fn_t)( +typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ - ompd_target_type_sizes_t *sizes /* OUT: type sizes */ + ompd_device_type_sizes_t *sizes /* OUT: type sizes */ ); /** * Look up the address of a global symbol in the target */ -typedef ompd_rc_t (*ompd_tsymbol_addr_fn_t)( +typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -336,7 +192,7 @@ typedef ompd_rc_t (*ompd_tsymbol_addr_fn_t)( /** * Read memory from the target */ -typedef ompd_rc_t (*ompd_tmemory_read_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -349,7 +205,7 @@ typedef ompd_rc_t (*ompd_tmemory_read_fn_t)( /** * Write memory from the target */ -typedef ompd_rc_t (*ompd_tmemory_write_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -359,7 +215,7 @@ typedef 
ompd_rc_t (*ompd_tmemory_write_fn_t)( const void *buffer /* IN: output buffer */ ); -typedef ompd_rc_t (*ompd_target_host_fn_t)( +typedef ompd_rc_t (*ompd_callback_device_host_fn_t)( ompd_address_space_context_t *address_space_context, /* IN */ const void *input, /* IN */ int unit_size, /* IN */ @@ -372,7 +228,7 @@ typedef ompd_rc_t (*ompd_target_host_fn_t)( * This is used by the OMPD library to have the debugger print a string. * The OMPD should not print directly. */ -typedef ompd_rc_t (*ompd_print_string_fn_t)( +typedef ompd_rc_t (*ompd_callback_print_string_fn_t)( const char *str /* IN: message to print */ ); @@ -381,34 +237,32 @@ typedef ompd_rc_t (*ompd_print_string_fn_t)( */ typedef struct ompd_callbacks_t { /* Debugger interface */ - ompd_dmemory_alloc_fn_t dmemory_alloc; - ompd_dmemory_free_fn_t dmemory_free; - ompd_print_string_fn_t print_string; + ompd_callback_memory_alloc_fn_t memory_alloc; + ompd_callback_memory_free_fn_t memory_free; + ompd_callback_print_string_fn_t print_string; /* Target interface */ - ompd_tsizeof_prim_fn_t tsizeof_prim; - ompd_tsymbol_addr_fn_t tsymbol_addr; - ompd_tmemory_read_fn_t read_tmemory; - ompd_tmemory_write_fn_t write_tmemory; - - ompd_target_host_fn_t target_to_host; - ompd_target_host_fn_t host_to_target; + ompd_callback_sizeof_fn_t sizeof_types; + ompd_callback_symbol_addr_fn_t symbol_addr_lookup; + ompd_callback_memory_read_fn_t read_memory; + ompd_callback_memory_write_fn_t write_memory; - ompd_get_thread_context_for_thread_id_fn_t get_thread_context_for_thread_id; - // ompd_get_process_context_for_context_fn_t get_containing_process_context; + ompd_callback_device_host_fn_t device_to_host; + ompd_callback_device_host_fn_t host_to_device; + ompd_callback_get_thread_context_for_thread_id_fn_t get_thread_context_for_thread_id; } ompd_callbacks_t; /****************************************************************************** * Call signatures from the debugger to the OMPD DLL. 
*/ -/* --- 4 Initialization ----------------------------------------------------- */ +/* --- 4.1 Initialization --------------------------------------------------- */ /** * The OMPD function ompd_get_version_string returns a descriptive string * describing an implementation of the OMPD library. The function - * ompd_get_version_compatibility returns an integer code used to indicate the + * ompd_get_api_version returns an integer code used to indicate the * revision of the OMPD specification supported by an implementation of OMPD. */ @@ -425,8 +279,13 @@ ompd_get_api_version_string(const char **string /* OUT: OMPD version string */ * maintain the functions valid for as long as needed. */ ompd_rc_t -ompd_initialize(const ompd_callbacks_t *table, /* IN: callbacks table */ - ompd_word_t version); +ompd_initialize(ompd_word_t version, + const ompd_callbacks_t *table /* IN: callbacks table */ + ); + +ompd_rc_t ompd_finalize(void); + +/* --- 4.2 Per Process Initialization and Finalization ---------------------- */ ompd_rc_t ompd_process_initialize(ompd_address_space_context_t @@ -435,53 +294,34 @@ ompd_process_initialize(ompd_address_space_context_t *addrhandle /* OUT: ompd handle for the target */ ); -ompd_rc_t -ompd_get_openmp_version(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *version); - -ompd_rc_t ompd_get_openmp_version_string( - ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - const char **string); +ompd_rc_t ompd_device_initialize( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_address_space_context_t *device_context, + int kind, + ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle + ); ompd_rc_t ompd_release_address_space_handle( ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ ); -ompd_rc_t ompd_device_initialize( - ompd_address_space_context_t - *context, /* IN: debugger 
handle for the device */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t * - *addrhandle /* OUT: ompd handle for the device */ - ); +/* --- 4.4 Address Space Information ---------------------------------------- */ -ompd_rc_t ompd_finalize(void); -/* --- 4 Handle Management -------------------------------------------------- */ +ompd_rc_t +ompd_get_omp_version(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *version); -/* --- 4.1 Thread Handles --------------------------------------------------- */ +ompd_rc_t ompd_get_omp_version_string( + ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + const char **string); + +/* --- 4.5 Thread Handles --------------------------------------------------- */ -/** - * Retrieve handles for all OpenMP threads. - * - * The ompd_get_threads operation enables the debugger to obtain handles for all - * OpenMP threads. A successful invocation of ompd_get_threads returns a pointer - * to a vector of handles in thread_handle_array and returns the number of - * handles in num_handles. This call yields meaningful results only if all - * OpenMP threads are stopped; otherwise, the OpenMP runtime may be creating - * and/or destroying threads during or after the call, rendering useless the - * vector of handles returned. - */ -#if 0 -ompd_rc_t ompd_get_threads ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ - int *num_handles /* OUT: number of handles in the array */ - ); -#endif /** * Retrieve handles for OpenMP threads in a parallel region. 
* @@ -500,11 +340,21 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_thread_handle_t **thread_handle /* OUT: handle */ ); -#if 0 -ompd_rc_t ompd_get_master_thread_in_parallel ( - ompd_parallel_handle_t *parallel_handle, /* IN */ - ompd_thread_handle_t **thread_handle); -#endif +/** + * Obtain an OpenMP thread handle and the internal OS thread handle for the + * selected (context) thread. + * If the function returns ompd_rc_ok then the operating system thread + * corresponds to an OpenMP thread and the thread_handle is initialized. The + * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime + * system. + */ +ompd_rc_t ompd_get_thread_handle( + ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ + ); ompd_rc_t ompd_release_thread_handle(ompd_thread_handle_t *thread_handle); @@ -512,14 +362,16 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, ompd_thread_handle_t *thread_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id -); -#endif +/** + * Obtain the OS thread handle for an OpenMP thread handle. + * this might change over time in case virtual openmp threads migrate between + * OS threads. + */ +ompd_rc_t ompd_get_thread_id( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id); -/* --- 4.2 Parallel Region Handles------------------------------------------- */ +/* --- 4.6 Parallel Region Handles------------------------------------------- */ /** * Retrieve the handle for the innermost patallel region for an OpenMP thread. 
@@ -572,14 +424,7 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, ompd_parallel_handle_t *parallel_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id -); -#endif - -/* --- 4.3 Task Handles ----------------------------------------------------- */ +/* --- 4.7 Task Handles ----------------------------------------------------- */ /** * Retrieve the handle for the innermost task for an OpenMP thread. @@ -589,7 +434,7 @@ ompd_rc_t ompd_get_parallel_handle_string_id ( * for the innermost task region associated with an OpenMP thread. This call is * meaningful only if the thread whose handle is provided is stopped. */ -ompd_rc_t ompd_get_current_task__handle( +ompd_rc_t ompd_get_current_task_handle( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ ); @@ -602,19 +447,13 @@ ompd_rc_t ompd_get_current_task__handle( * meaningful only if the thread executing the task specified by task_handle is * stopped. 
*/ -#if 0 -ompd_rc_t ompd_get_ancestor_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ); -#endif -ompd_rc_t ompd_get_generating_ancestor_task_handle( +ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ); -ompd_rc_t ompd_get_scheduling_ancestor_task_handle( +ompd_rc_t ompd_get_scheduling_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ); @@ -639,212 +478,11 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_task_handle_t *task_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id -); -#endif - -/* --- 5o Process and Thread Settings ---------------------------------------- - */ - -/** - * The functions ompd_get_num_procs and ompd_get_thread_limit are third-party - * versions of the OpenMP runtime functions omp_get_num_procs and - * omp_get_thread_limit. - */ - -ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ); - -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ); - -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ - -/** - * Determine the number of threads associated with a parallel region. 
- */ -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ); - -/** - * Determine the nesting depth of a particular parallel region instance. - */ -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ); - -/** - * Determine the number of enclosing active parallel regions. - * - * ompd_get_active_level returns the number of nested, active parallel regions - * enclosing the parallel region specified by its handle. - */ -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ); - -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - -/** - * The functions ompd_get_parallel_id and ompd_get_parallel_function are - * third-party variants of their OMPT counterparts. The only difference between - * the OMPD and OMPT versions is that the OMPD must supply a parallel region - * handle to provide a context for these inquiries. - */ -ompd_rc_t ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ); - -#if 0 -ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ); -#endif - -/* --- 7 Thread Inquiry ----------------------------------------------------- */ -/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - -/** - * Obtain an OpenMP thread handle and the internal OS thread handle for the - * selected (context) thread. - * If the function returns ompd_rc_ok then the operating system thread - * corresponds to an OpenMP thread and the thread_handle is initialized. 
The - * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime - * system. - */ -ompd_rc_t ompd_get_thread_handle( - ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ - ); - -/** - * Obtain the OS thread handle for an OpenMP thread handle. - * this might change over time in case virtual openmp threads migrate between - * OS threads. - */ -ompd_rc_t ompd_get_thread_id( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id); - -ompd_rc_t ompd_get_thread_data( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_address_t *data /* OUT: OpenMP thread data */ - ); - -ompd_rc_t ompd_get_thread_num( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ); - -/* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ - -/** - * Get the state of a thread. This can use OMPT state data structure to define - * different states of threads (e.g., idle, working, or barrier, etc) and what - * entity cased this state (e.g., address of a lock); - * - * The function ompd_get_state is a third-party version of ompt_get_state. The - * only difference between the OMPD and OMPT counterparts is that the OMPD - * version must supply a thread handle to provide a context for this inquiry. 
- */ -ompd_rc_t ompd_get_state( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *state, /* OUT: State of this thread */ - ompd_wait_id_t *wait_id /* OUT: Wait ID */ - ); - -/* --- 8 Task Inquiry ------------------------------------------------------- */ - -/* --- 8.1 Task Function Entry Point ---------------------------------------- */ - -/** - * The ompd_get_task_function returns the entry point of the code that - * corresponds to the body of code executed by the task. - */ -#if 0 ompd_rc_t ompd_get_task_function( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *entry_point /* OUT: first instruction in the task region */ - ); -#endif - -/* --- 8.2 Task Settings ---------------------------------------------------- */ - -/** - * Retrieve information from OpenMP tasks. These inquiry functions have no - * counterparts in the OMPT interface as a first-party tool can call OpenMP - * runtime inquiry functions directly. The only difference between the OMPD - * inquiry operations and their counterparts in the OpenMP runtime is that the - * OMPD version must supply a task handle to provide a context for each inquiry. - */ - -ompd_rc_t ompd_get_max_threads( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ); - -ompd_rc_t -ompd_in_parallel(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in parallel? */ - ); - -ompd_rc_t -ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in final? */ - ); - -ompd_rc_t -ompd_get_dynamic(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: ? */ - ); - -ompd_rc_t -ompd_get_nested(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: Is this task nested? 
*/ - ); - -ompd_rc_t ompd_get_max_active_levels( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: max active levels */ - ); - -ompd_rc_t -ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ); - -ompd_rc_t -ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *bind /* OUT: Kind of proc-binding */ - ); - -ompd_rc_t -ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: implicit=1, explicit=0 */ - ); + ompd_task_handle_t *task_handle, + ompd_address_t *entry_point); -/* --- 8.3 OMPT Task Inquiry Analogues -------------------------------------- */ /** * The functions defined here are third-party versions of ompt_get_task_frame @@ -876,12 +514,31 @@ ompd_rc_t ompd_get_task_frame( ompd_address_t *sp_reentry /* OUT: previous frame is user code */ ); -ompd_rc_t -ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_address_t *task_data /* OUT: OpenMP task ID */ - ); -/* --- 13 Display Control Variables ----------------------------------------- */ +/** + * Get the state of a thread. This can use OMPT state data structure to define + * different states of threads (e.g., idle, working, or barrier, etc) and what + * entity cased this state (e.g., address of a lock); + * + * The function ompd_get_state is a third-party version of ompt_get_state. The + * only difference between the OMPD and OMPT counterparts is that the OMPD + * version must supply a thread handle to provide a context for this inquiry. 
+ */ +ompd_rc_t ompd_enumerate_states ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + +ompd_rc_t ompd_get_state( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + +/* --- 4.8 Display Control Variables ---------------------------------------- */ /** * Using the ompd_display_control_vars function, the debugger can extract a @@ -905,6 +562,21 @@ ompd_rc_t ompd_release_display_control_vars( const char *const **control_var_values /* IN */ ); +/* --- 4.9 Internal Control Variables --------------------------------------- */ + +ompd_rc_t +ompd_enumerate_icvs(ompd_address_space_handle_t *handle, ompd_icv_id_t current, + ompd_icv_id_t *next_id, const char **next_icv_name, + ompd_scope_t *next_scope, int *more); + +ompd_rc_t +ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, ompd_icv_id_t icv_id, + ompd_word_t *icv_value); + +ompd_rc_t +ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, const char **icv_string); + #ifdef __cplusplus } #endif diff --git a/libompd/src/ompd_test.c b/libompd/src/ompd_test.c index 92609a66a..f54385730 100644 --- a/libompd/src/ompd_test.c +++ b/libompd/src/ompd_test.c @@ -59,7 +59,7 @@ void test_CB_tsizeof_prim() { test_print_header(); ompd_rc_t ret; - ompd_target_type_sizes_t sizes; + ompd_device_type_sizes_t sizes; ret = callbacks->tsizeof_prim((ompd_context_t *)1, &sizes); if (ret == ompd_rc_ok) { printf("%-20s %du\n", "Size of char:", sizes.sizeof_char); diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt index 3d9c78a72..8b721c0e7 100644 --- a/libomptarget/CMakeLists.txt +++ b/libomptarget/CMakeLists.txt @@ -53,6 +53,12 @@ if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) add_definitions(-O0) endif() +# OMPD support for libomptarget 
(currently only with cuda) +set(LIBOMPTARGET_OMPD_SUPPORT FALSE CACHE BOOL "OMPD-support?") +if (LIBOMPTARGET_OMPD_SUPPORT) + add_definitions(-DOMPD_SUPPORT=1) +endif() + include_directories(include) # Build target agnostic offloading library. diff --git a/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake new file mode 100644 index 000000000..5c6934011 --- /dev/null +++ b/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -0,0 +1,112 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# We use the compiler and linker provided by the user, attempt to use the one +# used to build libomptarget or just fail. +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) +else() + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS "${llvm_link}") + # Use llvm-link from the compiler directory. 
+ set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") +else() + return() +endif() + +function(try_compile_bitcode output source) + set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) + file(WRITE ${srcfile} "${source}\n") + set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) + + # The remaining arguments are the flags to be tested. + # FIXME: Don't hardcode GPU version. This is currently required because + # Clang refuses to compile its default of sm_20 with CUDA 9. + execute_process( + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} + --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} + RESULT_VARIABLE result + OUTPUT_QUIET ERROR_QUIET) + if (result EQUAL 0) + set(${output} TRUE PARENT_SCOPE) + else() + set(${output} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Save for which compiler we are going to do the following checks so that we +# can discard cached values if the user specifies a different value. +set(discard_cached FALSE) +if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND + NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) + set(discard_cached TRUE) +endif() +set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) + +function(check_bitcode_compilation output source) + if (${discard_cached} OR NOT DEFINED ${output}) + message(STATUS "Performing Test ${output}") + # Forward additional arguments which contain the flags. + try_compile_bitcode(result "${source}" ${ARGN}) + set(${output} ${result} CACHE INTERNAL "" FORCE) + if(${result}) + message(STATUS "Performing Test ${output} - Success") + else() + message(STATUS "Performing Test ${output} - Failed") + endif() + endif() +endfunction() + +# These flags are required to emit LLVM Bitcode. We check them together because +# if any of them are not supported, there is no point in finding out which are. 
+set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) + +# It makes no sense to continue given that the compiler doesn't support +# emitting basic LLVM Bitcode +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + return() +endif() + +set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) + +# Declaring external shared device variables might need an additional flag +# since Clang 7.0 and was entirely unsupported since version 4.0. +set(extern_device_shared_src "extern __device__ __shared__ int test;") + +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) + set(compiler_flag_fcuda_rdc -fcuda-rdc) + set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) + check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + return() + endif() + + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") +endif() + +# We can compile LLVM Bitcode from CUDA source code! 
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) diff --git a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index d9a76c2c6..b0fdc5b4d 100644 --- a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -54,6 +54,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) src/reduction.cu src/sync.cu src/task.cu + src/ompd-specific.cu ) set(omp_data_objects src/omp_data.cu) @@ -89,126 +90,95 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) # Install device RTL under the lib destination folder. - install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "lib") + install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + # Check if we can create an LLVM bitcode implementation of the runtime library - # that could be inlined in the user implementation. - set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL + # that could be inlined in the user application. For that we need to find + # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and + # an LLVM linker. 
+ set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + include(LibomptargetNVPTXBitcodeLibrary) + + set(bclib_default FALSE) + if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + set(bclib_default TRUE) + endif() + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL "Enable CUDA LLVM bitcode offloading device RTL.") if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") + endif() + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - # Find a clang compiler capable of compiling cuda files to LLVM bitcode and - # an LLVM linker. - # We use the one provided by the user, attempt to use the one used to build - # libomptarget or just fail. - - set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING - "Location of a CUDA compiler capable of emitting LLVM bitcode.") - set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING - "Location of a linker capable of linking LLVM bitcode objects.") - - if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) - elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + # Set flags for LLVM Bitcode compilation. 
+ set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} -DOMPTARGET_NVPTX_TEST=0) + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) else() - libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") - libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) endif() - # Get compiler directory to try to locate a suitable linker - get_filename_component(COMPILER_DIR ${CMAKE_C_COMPILER} DIRECTORY) - - if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) - elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND EXISTS "${COMPILER_DIR}/llvm-link") - # Use llvm-link from the directory containing clang - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link) - else() - libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.") - libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") + if(${LIBOMPTARGET_OMPD_SUPPORT}) + set(bc_flags ${bc_flags} -DOMPD_SUPPORT=1) endif() - if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - - # Decide which ptx version to use. Same choices as Clang. - if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9) - set(CUDA_PTX_VERSION ptx60) - else() - set(CUDA_PTX_VERSION ptx42) - endif() - - set(BC_DEBUG -DOMPTARGET_NVPTX_DEBUG=0) - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(BC_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) - endif() - - # Set flags for Clang cuda compilation. Only Clang is supported because there is - # no other compiler capable of generating bitcode from cuda sources. 
- set(CUDA_FLAGS - -emit-llvm - -O1 - -Xclang -target-feature - -Xclang +${CUDA_PTX_VERSION} - --cuda-device-only - -DOMPTARGET_NVPTX_TEST=0 - ${BC_DEBUG} - ) + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR GREATER 8) + set(bc_flags ${bc_flags} -Dnv_weak=weak) + endif() - # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared - # to handle. Therefore, we use 'weak' instead. We are compiling only for the - # device, so it should be equivalent. - if(CUDA_VERSION_MAJOR EQUAL 9) - set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak) - endif() - - # Get the compute capability the user requested or use SM_35 by default. - set(CUDA_ARCH "") - foreach(sm ${nvptx_sm_list}) - set(CUDA_ARCH --cuda-gpu-arch=sm_${sm}) - - # Compile cuda files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) - - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() - - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + # Generate a Bitcode library for all the compute capabilities the user requested. 
+  foreach(sm ${nvptx_sm_list}) +    set(cuda_arch --cuda-gpu-arch=sm_${sm}) + +    # Compile CUDA files to bitcode. +    set(bc_files "") +    foreach(src ${cuda_src_files}) +      get_filename_component(infile ${src} ABSOLUTE) +      get_filename_component(outfile ${src} NAME) + +      add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc +        COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} +          -c ${infile} -o ${outfile}-sm_${sm}.bc +        DEPENDS ${infile} +        IMPLICIT_DEPENDS CXX ${infile} +        COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" +        VERBATIM ) -      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) +      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) -      add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) +      list(APPEND bc_files ${outfile}-sm_${sm}.bc) +    endforeach() -      # Copy library to destination. -      add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD -                         COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc -                         $<TARGET_FILE_DIR:omptarget-nvptx>) +    # Link to a bitcode library. +    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc +      COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} +        -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} +      DEPENDS ${bc_files} +      COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" +    ) +    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) -      # Install device RTL under the lib destination folder. -      install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "lib") -    endforeach() -  endif() +    add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + +    # Copy library to destination. 
+    add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD +                       COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc +                       $<TARGET_FILE_DIR:omptarget-nvptx>) + +    # Install bitcode library under the lib destination folder. +    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") +  endforeach() endif() else() diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h index 84f6ec608..a02d962f6 100644 --- a/libomptarget/deviceRTLs/nvptx/src/interface.h +++ b/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -516,4 +516,7 @@ EXTERN void __kmpc_data_sharing_environment_end( EXTERN void * __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, int16_t IsOMPRuntimeInitialized); + +// SPMD execution mode interrogation function. +EXTERN int8_t __kmpc_is_spmd_exec_mode(); #endif diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index e76475238..0cb15f095 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" - +#ifdef OMPD_SUPPORT + #include "ompd-specific.h" +#endif /*OMPD_SUPPORT*/ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // template class that encapsulate all the helper functions @@ -86,7 +88,7 @@ public: T inputUb = ub; ub = lb + chunk - 1; // Clang uses i <= ub - last = ub == inputUb; + last = lb <= inputUb && inputUb <= ub; stride = loopSize; // make sure we only do 1 chunk per warp } @@ -96,8 +98,8 @@ public: INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter, T *plower, T *pupper, ST *pstride, ST chunk, bool IsSPMDExecutionMode, - bool IsOMPRuntimeUnavailable = false) { - // When 
IsOMPRuntimeUnavailable is true, we assume that the caller is + bool IsRuntimeUninitialized) { + // When IsRuntimeUninitialized is true, we assume that the caller is // in an L0 parallel region and that all worker threads participate. int tid = GetLogicalThreadIdInBlock(); @@ -105,23 +107,23 @@ public: // Assume we are in teams region or that we use a single block // per target region ST numberOfActiveOMPThreads = GetNumberOfOmpThreads( - tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + tid, IsSPMDExecutionMode, IsRuntimeUninitialized); // All warps that are in excess of the maximum requested, do // not execute the loop PRINT(LD_LOOP, "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " "%d, num tids %d\n", - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized), schedtype, P64(chunk), - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized), GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable)); + IsRuntimeUninitialized)); ASSERT0( LT_FUSSY, - (GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)) < + (GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) < (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable)), + IsRuntimeUninitialized)), "current thread is not needed here; error"); // copy @@ -135,9 +137,9 @@ public: case kmp_sched_static_chunk: { if (chunk > 0) { entityId = - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); break; @@ -145,9 +147,9 @@ public: } // note: if chunk <=0, use nochunk case kmp_sched_static_nochunk: { entityId = - 
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); break; @@ -172,12 +174,12 @@ public: case kmp_sched_distr_static_chunk_sched_static_chunkone: { entityId = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable) * + IsRuntimeUninitialized) * GetOmpTeamId() + - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpTeams() * GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); break; @@ -187,9 +189,9 @@ public: PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", schedtype); entityId = - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); } @@ -202,9 +204,12 @@ public: PRINT(LD_LOOP, "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable), + IsRuntimeUninitialized), GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride)); +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// @@ -215,7 +220,8 @@ public: schedule <= kmp_sched_ordered_last; } - INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, 
ST st, + INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, ST chunk) { int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); @@ -239,12 +245,17 @@ public: // Process schedule. if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + if (OrderedSchedule(schedule)) { + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); + } PRINT(LD_LOOP, "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", (long)tnum, P64(tripCount), schedule); schedule = kmp_sched_static_chunk; chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { // process runtime omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); @@ -282,18 +293,15 @@ public: "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); } - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // init schedules if (schedule == kmp_sched_static_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; // save ub omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; // compute static chunk ST stride; - T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); int lastiter = 0; ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params @@ -301,8 +309,8 @@ public: omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 "," - "next lower bound = %llu, stride = %llu\n", + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", GetNumberOfOmpThreads(tid, isSPMDMode(), 
isRuntimeUninitialized()), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), @@ -310,11 +318,12 @@ public: } else if (schedule == kmp_sched_static_nochunk) { ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; // save ub omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; // compute static chunk ST stride; - T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); int lastiter = 0; ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params @@ -322,45 +331,53 @@ public: omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 "," - "next lower bound = %llu, stride = %llu\n", + "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - if (chunk < 1) - chunk = 1; - Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks - // but each thread (but one) must discover that it is last - eventNum += tnum; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum; + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); + // save sched state + int teamId = GetOmpTeamId(); + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + if (GetThreadIdInBlock() == 0) { + if (chunk < 1) + chunk = 1; + 
omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub; + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb; + } + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", " - "events number = %llu\n", + "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 + ", chunk %" PRIu64 "\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), + omptarget_nvptx_threadPrivateContext->Chunk(teamId)); } +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// // Support for dispatch next - INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg, - Counter priv, T &lb, T &ub, - Counter &chunkId, Counter ¤tEvent, - T chunkSize, T loopUpperBound) { - // get next event atomically - Counter nextEvent = cg.Next(); - // calculate chunk Id (priv was initialized upon entering the loop to - // 'start' == 'event') - chunkId = nextEvent - priv; + INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + Counter &loopLowerBound, + T loopUpperBound) { // calculate lower bound for all lanes in the warp - lb = chunkId * chunkSize; // this code assume normalization of LB + lb = atomicAdd(&loopLowerBound, (Counter)chunkSize); ub = lb + chunkSize - 1; // Clang uses i <= ub // 3 result cases: @@ -368,9 +385,8 @@ public: // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> // NOT_FINISHED // c. 
lb and ub >= loopUpperBound: empty chunk --> FINISHED - currentEvent = nextEvent; // a. - if (ub <= loopUpperBound) { + if (lb <= loopUpperBound && ub < loopUpperBound) { PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), P64(ub), P64(loopUpperBound)); return NOT_FINISHED; @@ -383,7 +399,8 @@ public: return LAST_CHUNK; } // c. if we are here, we are in case 'c' - lb = loopUpperBound + 1; + lb = loopUpperBound + 2; + ub = loopUpperBound + 1; PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), P64(ub), P64(loopUpperBound)); return FINISHED; @@ -437,29 +454,18 @@ public: ASSERT0(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, "bad sched"); - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); T myLb, myUb; - Counter chunkId; - // xxx current event is now local - omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int teamId = GetOmpTeamId(); int finished = DynamicNextChunk( - cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb, - chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) { - cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid), - omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); - cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid), - omptarget_nvptx_threadPrivateContext->CurrentEvent(tid)); + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId), + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId)); + if (finished == FINISHED) return DISPATCH_FINISHED; - } // not finished (either not finished or last chunk) - *plast = (int32_t)( - myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + *plast = (int32_t)(finished == LAST_CHUNK); *plower = myLb; *pupper = myUb; *pstride = 
1; @@ -474,6 +480,9 @@ public: INLINE static void dispatch_fini() { // nothing +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif } //////////////////////////////////////////////////////////////////////////////// @@ -491,7 +500,7 @@ EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, @@ -499,7 +508,7 @@ EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, @@ -507,7 +516,7 @@ EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, @@ -515,7 +524,7 @@ EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } // next @@ -581,7 +590,8 @@ EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, 
isSPMDMode(), + isRuntimeUninitialized()); } EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, @@ -591,7 +601,8 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), + isRuntimeUninitialized()); } EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, @@ -601,7 +612,8 @@ EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), + isRuntimeUninitialized()); } EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, @@ -611,7 +623,8 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), + isRuntimeUninitialized()); } EXTERN @@ -623,8 +636,8 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -636,8 +649,8 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); 
omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -649,8 +662,8 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -662,8 +675,8 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -674,8 +687,8 @@ void __kmpc_for_static_init_4_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -686,8 +699,8 @@ void __kmpc_for_static_init_4u_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -698,8 +711,8 @@ void __kmpc_for_static_init_8_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, 
plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -710,11 +723,14 @@ void __kmpc_for_static_init_8u_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) { +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu new file mode 100644 index 000000000..3cc18b908 --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -0,0 +1,107 @@ +#ifdef OMPD_SUPPORT +#include "ompd-specific.h" +#include "omptarget-nvptx.h" +/** + * Declaration of symbols to hold struct size and member offset information + */ + +__device__ __shared__ static int ompd_target_initialized; + +#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m##_; +OMPD_FOREACH_ACCESS(ompd_target_declare_access) +#undef ompd_target_declare_access + +#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m##_; + OMPD_FOREACH_ACCESS(ompd_target_declare_sizeof_member) +#undef ompd_target_declare_sizeof_member + +#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t##_; + OMPD_FOREACH_SIZEOF(ompd_target_declare_sizeof) +#undef ompd_target_declare_sizeof + +__device__ __shared__ + uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam_; + +__device__ __shared__ + uint64_t 
ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam_; + + +__device__ void ompd_init ( void ) +{ + if (ompd_target_initialized) + return; + +#define ompd_target_init_access(t,m) ompd_access__##t##__##m##_ = (uint64_t)&(((t*)0)->m); + OMPD_FOREACH_ACCESS(ompd_target_init_access) +#undef ompd_target_init_access + + ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam_ = + (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); + +#define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m##_ = sizeof(((t*)0)->m); + OMPD_FOREACH_ACCESS(ompd_target_init_sizeof_member) +#undef ompd_target_init_sizeof_member + + ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam_ = + (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); + +#define ompd_target_init_sizeof(t) ompd_sizeof__##t##_ = sizeof(t); + OMPD_FOREACH_SIZEOF(ompd_target_init_sizeof) +#undef ompd_target_init_sizeof + + omptarget_nvptx_threadPrivateContext->ompd_levelZeroParallelInfo.level = 0; + if (isSPMDMode()) { + omptarget_nvptx_threadPrivateContext->teamContext.levelZeroTaskDescr + .ompd_thread_info.enclosed_parallel.parallel_tasks = + &omptarget_nvptx_threadPrivateContext->levelOneTaskDescr[0]; + } else { + // generic mode + omptarget_nvptx_threadPrivateContext->ompd_levelZeroParallelInfo + .parallel_tasks = &omptarget_nvptx_threadPrivateContext->teamContext + .levelZeroTaskDescr; + } + + ompd_target_initialized = 1; +} + +INLINE void ompd_init_thread(omptarget_nvptx_TaskDescr *currTaskDescr, + void *task_func, uint8_t implicit) { + currTaskDescr->ompd_thread_info.blockIdx_x = blockIdx.x; + currTaskDescr->ompd_thread_info.threadIdx_x = threadIdx.x; + currTaskDescr->ompd_thread_info.task_function = task_func; + currTaskDescr->ompd_thread_info.task_implicit = implicit; +} + +__device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state) { + taskDescr->ompd_thread_info.state = state; +} + +__device__ 
void ompd_set_device_thread_state(omp_state_t state) { + ompd_set_device_specific_thread_state(getMyTopTaskDescriptor(), state); +} + +__device__ void ompd_init_thread_parallel() { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(currTaskDescr, omptarget_nvptx_workFn, 1); + ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_parallel); +} + +__device__ void ompd_init_thread_master() { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(currTaskDescr, NULL, 1); + ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_serial); +} + +__device__ void ompd_init_explicit_task(void *task_func) { + omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(taskDescr, task_func, 0); +} + +__device__ void ompd_bp_parallel_begin (){ asm (""); } +__device__ void ompd_bp_parallel_end (){ asm (""); } +__device__ void ompd_bp_task_begin (){ asm (""); } +__device__ void ompd_bp_task_end (){ asm (""); } +__device__ void ompd_bp_thread_begin (){ asm (""); } +__device__ void ompd_bp_thread_end (){ asm (""); } +#endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h new file mode 100644 index 000000000..8b929e5fe --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -0,0 +1,94 @@ +#ifndef __OMPD_SPECIFIC_H__ +#define __OMPD_SPECIFIC_H__ + +#ifdef OMPD_SUPPORT + +#include "state-queue.h" +#include "option.h" +#include + + + +__device__ void ompd_init( void ); +extern "C" __device__ void ompd_bp_parallel_begin ( void ); +extern "C" __device__ void ompd_bp_parallel_end ( void ); +extern "C" __device__ void ompd_bp_task_begin ( void ); +extern "C" __device__ void ompd_bp_task_end ( void ); +extern "C" __device__ void ompd_bp_thread_begin ( void ); +extern "C" __device__ void ompd_bp_thread_end ( void ); + + +#define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ + 
OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext, topTaskDescr) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,teamContext) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,ompd_levelZeroParallelInfo) \ + OMPD_ACCESS(omptarget_nvptx_TaskDescr,ompd_thread_info) \ + OMPD_ACCESS(omptarget_nvptx_TaskDescr,prev) \ + OMPD_ACCESS(omptarget_nvptx_TeamDescr,levelZeroTaskDescr) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,state) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,threadIdx_x) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,enclosed_parallel) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,task_function) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,task_implicit) \ + OMPD_ACCESS(ompd_nvptx_parallel_info_t,level) \ + OMPD_ACCESS(ompd_nvptx_parallel_info_t,parallel_tasks) + + +#define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ + OMPD_SIZEOF(omptarget_nvptx_ThreadPrivateContext)\ + OMPD_SIZEOF(omptarget_nvptx_TaskDescr) \ + OMPD_SIZEOF(ompd_nvptx_thread_info_t) + + +/* we only support work states for the moment */ +typedef enum { + omp_state_undefined = 0x102, + omp_state_work_serial = 0x000, + omp_state_work_parallel = 0x001, + omp_state_work_reduction = 0x002 +} omp_state_t; + +class omptarget_nvptx_TaskDescr; + +__device__ void ompd_init_thread_master(); +__device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); +__device__ void ompd_set_device_thread_state(omp_state_t state); +__device__ void ompd_init_thread_parallel(); +__device__ void ompd_init_explicit_task(void *task_func); + +INLINE void ompd_reset_device_thread_state() { + ompd_set_device_thread_state(omp_state_work_serial); +} + +/* We store parallel info in the threadPrivateContext the same way that task + * descriptors are stored. 
Currently there is no support for nested + * parallelism (TODO: there will probably be in the future), so we store one + * parallel descriptor in the threadPrivateContext for the outermost parallel + * region and additionally one descriptor in each thread in case of serialized + * inner parallel regions + */ +typedef struct { + uint16_t level; + /* If level = 0, parallel_tasks points just to the master task descriptor + * if level = 1, parallel_tasks points to threadPrivateContext->levelOneTaskDescr + * if level > 1, we are in a serialized parallel region and parallel_tasks points + * to the single task in the parallel region. + */ + omptarget_nvptx_TaskDescr *parallel_tasks; +} ompd_nvptx_parallel_info_t; + +typedef struct { + uint64_t state; // In the host runtime we use the OMPT state. + // Here we need to have our own place to store it. + uint16_t blockIdx_x; // Libomptarget should only schedule task in one dimension. + // To store a unique identifier for the current thread, we + // simply store ThreadIdx.x and BlockIdx.x + uint16_t threadIdx_x; + ompd_nvptx_parallel_info_t enclosed_parallel; + void *task_function; + uint8_t task_implicit; +} ompd_nvptx_thread_info_t; + +#endif /* OMPD_SUPPORT */ +#endif diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 4a8610403..f3202a2bb 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -92,6 +92,11 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); currTaskDescr->ThreadLimit() = ThreadLimit; +#ifdef OMPD_SUPPORT + ompd_init(); + ompd_init_thread_master(); + ompd_bp_thread_begin(); +#endif /*OMPD_SUPPORT*/ } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -105,6 +110,9 @@ EXTERN void
__kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); } +#ifdef OMPD_SUPPORT + ompd_bp_thread_end(); +#endif // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; } @@ -138,6 +146,11 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, currTeamDescr.InitTeamDescr(); // init counters (copy start to init) workDescr.CounterGroup().Reset(); +#ifdef OMPD_SUPPORT + ompd_init(); + ompd_bp_parallel_begin(); // This should be placed later, but the parallel + // handle is ready from here on. +#endif /*OMPD_SUPPORT*/ } __syncthreads(); @@ -173,17 +186,33 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; } +#ifdef OMPD_SUPPORT + ompd_init_thread_parallel(); // __kmpc_kernel_parallel() is not called in + // spmd mode + ompd_bp_thread_begin(); +#endif } EXTERN void __kmpc_spmd_kernel_deinit() { // We're not going to pop the task descr stack of each thread since // there are no more parallel regions in SPMD mode. __syncthreads(); +#ifdef OMPD_SUPPORT + ompd_bp_thread_end(); +#endif int threadId = GetThreadIdInBlock(); if (threadId == 0) { +#ifdef OMPD_SUPPORT + ompd_bp_parallel_end(); +#endif // Enqueue omp state object for use by another team. int slot = smid() % MAX_SM; omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); } } + +// Return true if the current target region is executed in SPMD mode. 
+EXTERN int8_t __kmpc_is_spmd_exec_mode() { + return isSPMDMode(); +} diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 2bc5819e6..88daa79d4 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -32,6 +32,7 @@ #include "option.h" // choices we have #include "state-queue.h" #include "support.h" +#include "ompd-specific.h" #define OMPTARGET_NVPTX_VERSION 1.1 @@ -53,13 +54,13 @@ #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down_sync((mask), (var), (delta), (width)) + __shfl_down_sync((mask), (var), (delta), (width)) #define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate)) #define __ACTIVEMASK() __activemask() #else #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down((var), (delta), (width)) + __shfl_down((var), (delta), (width)) #define __BALLOT_SYNC(mask, predicate) __ballot((predicate)) #define __ACTIVEMASK() __ballot(1) #endif @@ -150,6 +151,14 @@ extern __device__ __shared__ DataSharingStateTy DataSharingState; // task ICV and (implicit & explicit) task state class omptarget_nvptx_TaskDescr { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); + friend INLINE void ompd_init_thread( + omptarget_nvptx_TaskDescr *currTaskDescr, void *task_func, + uint8_t implicit); + friend __device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); +#endif /* OMPD_SUPPORT */ public: // methods for flags INLINE omp_sched_t GetRuntimeSched(); @@ -192,6 +201,11 @@ class omptarget_nvptx_TaskDescr { INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr
*parentTaskDescr, uint16_t tid, uint16_t tnum); +#ifdef OMPD_SUPPORT + INLINE ompd_nvptx_thread_info_t *ompd_ThreadInfo() { + return &ompd_thread_info; + } +#endif private: // bits for flags: (7 used, 1 free) @@ -216,6 +230,9 @@ class omptarget_nvptx_TaskDescr { uint16_t threadsInTeam; // threads in current team uint64_t runtimeChunkSize; // runtime chunk size } items; +#ifdef OMPD_SUPPORT + ompd_nvptx_thread_info_t ompd_thread_info; +#endif omptarget_nvptx_TaskDescr *prev; }; @@ -247,6 +264,9 @@ class omptarget_nvptx_WorkDescr { //////////////////////////////////////////////////////////////////////////////// class omptarget_nvptx_TeamDescr { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /*OMPD_SUPPORT*/ public: // access to data INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { @@ -310,6 +330,9 @@ // tid refers here to the global thread id // do not support multiple concurrent kernel a this time class omptarget_nvptx_ThreadPrivateContext { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /* OMPD_SUPPORT */ public: // task INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { @@ -377,6 +400,10 @@ Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; // Queue to which this object must be returned.
uint64_t SourceQueue; +#ifdef OMPD_SUPPORT + // The implicit parallel region around the master task in generic mode + ompd_nvptx_parallel_info_t ompd_levelZeroParallelInfo; +#endif }; /// Device envrionment data diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d4546284f..0446d7170 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -34,6 +34,9 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#ifdef OMPD_SUPPORT + #include "ompd-specific.h" +#endif /*OMPD_SUPPORT*/ typedef struct ConvergentSimdJob { omptarget_nvptx_TaskDescr taskDescr; @@ -301,6 +304,20 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "only team master can create parallel"); +#ifdef OMPD_SUPPORT + // Set ompd info for first level parallel region (this info is stored in the + // master threads task info, so it can easily be accessed + ompd_nvptx_parallel_info_t &nextPar = currTaskDescr->ompd_ThreadInfo() + ->enclosed_parallel; + nextPar.level = 1; + nextPar.parallel_tasks = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(0); + // Move the previous thread into undefined state (will be reset in __kmpc_kernel_end_parallel) + // TODO (mr) find a better place to do this + ompd_set_device_thread_state(omp_state_undefined); + ompd_bp_parallel_begin(); +#endif /*OMPD_SUPPORT*/ + // set number of threads on work descriptor // this is different from the number of cuda threads required for the parallel // region @@ -355,6 +372,10 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, newTaskDescr->ThreadId(), newTaskDescr->NThreads()); isActive = true; +#ifdef OMPD_SUPPORT + ompd_init_thread_parallel(); + ompd_bp_thread_begin(); +#endif /*OMPD_SUPPORT*/ } return isActive; @@ -369,6 +390,13 @@ EXTERN void __kmpc_kernel_end_parallel() { 
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); + ompd_bp_thread_end(); + if (threadId == 0) { + ompd_bp_parallel_end(); + } +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// @@ -400,9 +428,26 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { newTaskDescr->ThreadId() = 0; newTaskDescr->ThreadsInTeam() = 1; +#ifdef OMPD_SUPPORT + // Set ompd parallel info for the next parallel region in the previous task + // descriptor + ompd_nvptx_parallel_info_t &newPar = + currTaskDescr->ompd_ThreadInfo()->enclosed_parallel; + newPar.level = currTaskDescr->GetPrevTaskDescr() + ->ompd_ThreadInfo() + ->enclosed_parallel + .level + 1; + newPar.parallel_tasks = newTaskDescr; +#endif + // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); +#ifdef OMPD_SUPPORT + ompd_init_thread_parallel(); // we are still in a parallel region + // every thread is a parallel region..
hooray + ompd_bp_parallel_begin(); +#endif /*OMPD_SUPPORT*/ } EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, @@ -415,6 +460,9 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, // set new top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); +#ifdef OMPD_SUPPORT + ompd_bp_parallel_end(); +#endif // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); } diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index afa8e81eb..ac1cd8407 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -161,6 +161,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + if (NumThreads == 1) + return 1; /* * This reduce function handles reduction within a team. It handles * parallel regions in both L1 and L2 parallelism levels. It also @@ -171,11 +176,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, * 3. Warp 0 reduces to a single value. * 4. The reduced value is available in the thread that returns 1. 
*/ +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_reduction); +#endif /*OMPD_SUPPORT*/ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; uint32_t WarpId = BlockThreadId / WARPSIZE; @@ -203,8 +208,17 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, BlockThreadId); +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + return BlockThreadId == 0; } + +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + return BlockThreadId == 0; #else uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); @@ -219,10 +233,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, // early. return gpu_irregular_simd_reduce(reduce_data, shflFct); - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); - // When we have more than [warpsize] number of threads // a block reduction is performed here. // @@ -243,6 +253,10 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, return BlockThreadId == 0; } +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + // Get the OMP thread Id. This is different from BlockThreadId in the case of // an L2 parallel region. return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, @@ -289,6 +303,9 @@ int32_t nvptx_teams_reduce_nowait( // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. 
+#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_reduction); +#endif /*OMPD_SUPPORT*/ uint32_t NumThreads = isSPMDExecutionMode ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, @@ -403,6 +420,9 @@ int32_t nvptx_teams_reduce_nowait( } #endif // __CUDA_ARCH__ >= 700 +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ return ThreadId == 0; } diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu index a577d7a6c..68f08a16a 100644 --- a/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -35,40 +35,46 @@ EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) { EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __syncthreads(); + __kmpc_barrier(loc_ref, tid); PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); return 0; } EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { - tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - if (!currTaskDescr->InL2OrHigherParallelRegion()) { - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + if (isSPMDMode()) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else if (isRuntimeUninitialized()) { + __kmpc_barrier_simple_generic(loc_ref, tid); + } else { + tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); + if (!currTaskDescr->InL2OrHigherParallelRegion()) { + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. 
This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 - // worker is started, so we don't need a barrier. - if (numberOfActiveOMPThreads > 1) { + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 + // worker is started, so we don't need a barrier. + if (numberOfActiveOMPThreads > 1) { #endif - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - numberOfActiveOMPThreads, threads); - // Barrier #1 is for synchronization among active threads. - named_sync(L1_BARRIER, threads); + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. + named_sync(L1_BARRIER, threads); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } // numberOfActiveOMPThreads > 1 + } // numberOfActiveOMPThreads > 1 #endif + } + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } // Emit a simple barrier call in SPMD mode. 
Assumes the caller is in an L0 diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/nvptx/src/task.cu index 8d4796778..76166ea8c 100644 --- a/libomptarget/deviceRTLs/nvptx/src/task.cu +++ b/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -97,7 +97,10 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - +#ifdef OMPD_SUPPORT + ompd_init_explicit_task((void*)(newKmpTaskDescr->sub)); + ompd_bp_task_begin(); +#endif // 3. call sub PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); @@ -105,6 +108,10 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, PRINT(LD_TASK, "return from call task sub 0x%llx()\n", P64(newKmpTaskDescr->sub)); +#ifdef OMPD_SUPPORT + ompd_bp_task_end(); +#endif + // 4. pop context omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, parentTaskDescr); diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt index 6c24d0e1d..8c3d57168 100644 --- a/libomptarget/plugins/CMakeLists.txt +++ b/libomptarget/plugins/CMakeLists.txt @@ -37,7 +37,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") # Install plugin under the lib destination folder. 
install(TARGETS "omptarget.rtl.${tmachine_libname}" - LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries( "omptarget.rtl.${tmachine_libname}" diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt index 8763065e7..7210eec10 100644 --- a/libomptarget/plugins/cuda/CMakeLists.txt +++ b/libomptarget/plugins/cuda/CMakeLists.txt @@ -39,7 +39,7 @@ include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) # Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) +install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries(omptarget.rtl.cuda ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp index fe2f9f67c..90048a3eb 100644 --- a/libomptarget/plugins/cuda/src/rtl.cpp +++ b/libomptarget/plugins/cuda/src/rtl.cpp @@ -54,6 +54,19 @@ static int DebugLevel = 0; {} #endif +#if OMPD_SUPPORT +#ifdef __cplusplus +extern "C" { +#endif + /* TODO - Put these OMPD globals someplace cleaner */ + uint64_t ompd_num_cuda_devices; + CUcontext* ompd_CudaContextArray; +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* OMPD_SUPPORT */ + + /// Keep entries table per device. struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -92,7 +105,7 @@ std::list KernelsList; /// Class containing all the device information. 
class RTLDeviceInfoTy { - std::vector FuncGblEntries; + std::vector> FuncGblEntries; public: int NumberOfDevices; @@ -122,7 +135,7 @@ class RTLDeviceInfoTy { void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); E.Entries.push_back(entry); } @@ -131,7 +144,7 @@ class RTLDeviceInfoTy { bool findOffloadEntry(int32_t device_id, void *addr) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); for (auto &it : E.Entries) { if (it.addr == addr) @@ -145,7 +158,7 @@ class RTLDeviceInfoTy { __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); int32_t size = E.Entries.size(); @@ -167,7 +180,8 @@ class RTLDeviceInfoTy { void clearOffloadEntriesTable(int32_t device_id) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = 0; } @@ -204,6 +218,10 @@ class RTLDeviceInfoTy { FuncGblEntries.resize(NumberOfDevices); Contexts.resize(NumberOfDevices); +#if OMPD_SUPPORT + ompd_num_cuda_devices = (uint64_t)Contexts.size(); + ompd_CudaContextArray = &Contexts[0]; +#endif /* OMPD_SUPPORT */ ThreadsPerBlock.resize(NumberOfDevices); BlocksPerGrid.resize(NumberOfDevices); WarpSize.resize(NumberOfDevices); diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt index 2d606728e..be099f309 100644 --- 
a/libomptarget/src/CMakeLists.txt +++ b/libomptarget/src/CMakeLists.txt @@ -28,4 +28,4 @@ target_link_libraries(omptarget "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") # Install libomptarget under the lib destination folder. -install(TARGETS omptarget LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) +install(TARGETS omptarget LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") diff --git a/offload/Makefile b/offload/Makefile deleted file mode 100755 index 75e3744a6..000000000 --- a/offload/Makefile +++ /dev/null @@ -1,224 +0,0 @@ -# -##//===----------------------------------------------------------------------===// -#// -#// The LLVM Compiler Infrastructure -#// -#// This file is dual licensed under the MIT and the University of Illinois Open -#// Source Licenses. See LICENSE.txt for details. -#// -#//===----------------------------------------------------------------------===// -# - -# MAKEFILE PARAMETERS -# -# root_dir - path to root directory of liboffload -# build_dir - path to build directory -# mpss_dir - path to root directory of mpss -# mpss_version - version of the mpss (e.g., version "3.3.x" would be "33") -# libiomp_host_dir - path to host libiomp directory (unnecessary if compiler_host is icc) -# libiomp_target_dir - path to target libiomp directory (unnecesarry if compiler_target is icc) -# omp_header_dir - path to omp.h (unnecessary if compiler_host and compiler_target are icc) -# os_host - host operating system -# os_target - target operating system -# compiler_host - host compiler -# compiler_target - target compiler -# options_host - additional options for host compiler -# options_target - additional options for target compiler -# - -# Directories -root_dir?=. 
-build_dir?=$(root_dir)/build -build_host_dir=$(build_dir)/host -build_target_dir=$(build_dir)/target -obj_host_dir=$(build_dir)/obj_host -obj_target_dir=$(build_dir)/obj_target -source_dir=$(root_dir)/src -imported_dir=$(source_dir)/imported - -# OS -os_host?=linux -os_target?=linux -ifneq ($(os_host)_$(os_target), linux_linux) - $(error "Only linux is supported") -endif - -# Compilers -compiler_host?=gcc -compiler_target?=gcc - -# MPSS -mpss_version?=30 -mpss_dir?=/ -mpss_present=$(shell if test -d $(mpss_dir); then echo OK; else echo KO; fi) -ifneq ($(mpss_present), OK) - $(error "Cannot find MPSS directory $(mpss_dir)") -endif - -ifeq ($(shell test $(mpss_version) -gt 33; echo $$?), 0) - coi_dir=$(mpss_dir)/sysroots/k1om-mpss-linux/usr - coi_include=$(coi_dir)/include/intel-coi - coi_lib_host=$(mpss_dir)/lib64 - coi_lib_device=$(coi_dir)/lib64 -else - coi_dir=$(mpss_dir)/opt/intel/mic/coi - coi_include=$(coi_dir)/include - coi_lib_host=$(coi_dir)/host-linux-release/lib - coi_lib_device=$(coi_dir)/device-linux-release/lib -endif -myo_dir=$(mpss_dir)/opt/intel/mic/myo - -# Sources -src_liboffload_common=dv_util.cpp liboffload_error.c liboffload_msg.c offload_common.cpp offload_table.cpp offload_trace.cpp offload_util.cpp - -src_liboffload_host=$(src_liboffload_common) cean_util.cpp coi/coi_client.cpp compiler_if_host.cpp offload_engine.cpp offload_env.cpp offload_host.cpp offload_omp_host.cpp offload_timer_host.cpp offload_orsl.cpp orsl-lite/lib/orsl-lite.c offload_myo_host.cpp -src_liboffload_host:=$(foreach file,$(src_liboffload_host),$(source_dir)/$(file)) - -src_liboffload_target=$(src_liboffload_common) coi/coi_server.cpp compiler_if_target.cpp offload_omp_target.cpp offload_target.cpp offload_timer_target.cpp offload_myo_target.cpp -src_liboffload_target:=$(foreach file,$(src_liboffload_target),$(source_dir)/$(file)) - -src_ofld=ofldbegin.cpp ofldend.cpp -src_ofld:=$(foreach file,$(src_ofld),$(source_dir)/$(file)) - -headers=$(wildcard $(source_dir)/*.h) 
$(wildcard $(source_dir)/coi/*.h) $(wildcard $(source_dir)/orsl-lite/include/*.h) -ifneq ($(omp_header_dir), ) - headers+=$(imported_dir)/omp.h -endif - -# Objects -obj_liboffload_host=$(notdir $(src_liboffload_host)) -obj_liboffload_host:=$(obj_liboffload_host:.cpp=.o) -obj_liboffload_host:=$(obj_liboffload_host:.c=.o) -obj_liboffload_host:=$(foreach file,$(obj_liboffload_host),$(obj_host_dir)/$(file)) - -obj_liboffload_target=$(notdir $(src_liboffload_target)) -obj_liboffload_target:=$(obj_liboffload_target:.cpp=.o) -obj_liboffload_target:=$(obj_liboffload_target:.c=.o) -obj_liboffload_target:=$(foreach file,$(obj_liboffload_target),$(obj_target_dir)/$(file)) - -obj_ofld=$(notdir $(src_ofld)) -obj_ofld:=$(obj_ofld:.cpp=.o) -obj_ofld_host=$(foreach file,$(obj_ofld),$(build_host_dir)/$(file)) -obj_ofld_target=$(foreach file,$(obj_ofld),$(build_target_dir)/$(file)) - -# Options -opts_common=-O2 -w -fpic -c -DCOI_LIBRARY_VERSION=2 -DMYO_SUPPORT -DOFFLOAD_DEBUG=1 -DSEP_SUPPORT -DTIMING_SUPPORT -I$(coi_include) -I$(myo_dir)/include -I$(source_dir) -ifneq ($(omp_header_dir), ) - opts_common+=-I$(imported_dir) -endif - -opts_liboffload=-shared -Wl,-soname,liboffload.so.5 -ldl -lstdc++ -liomp5 - -opts_liboffload_host=$(opts_liboffload) -L$(coi_lib_host) -lcoi_host -L$(myo_dir)/lib -lmyo-client -ifneq ($(libiomp_host_dir), ) - opts_liboffload_host+=-L$(libiomp_host_dir) -endif - -opts_liboffload_target=$(opts_liboffload) -L$(coi_lib_device) -lcoi_device -L$(myo_dir)/lib -lmyo-service -ifneq ($(libiomp_target_dir), ) - opts_liboffload_target+=-L$(libiomp_target_dir) -endif - -options_host?= -opts_host=$(options_host) -DHOST_LIBRARY=1 -DMPSS_VERSION=$(mpss_version) -ifeq ($(os_host), linux) - opts_host+=-DLINUX -endif - -options_target?= -opts_target=$(options_target) -DHOST_LIBRARY=0 -ifeq ($(os_target), linux) - opts_target+=-DLINUX -endif -ifeq ($(compiler_target), icc) - opts_target+=-mmic -endif - -# Make targets -.PHONY: all clean info - -all: info 
$(build_host_dir)/liboffload.so $(build_target_dir)/liboffload.so $(obj_ofld_host) $(obj_ofld_target) - - -$(build_host_dir)/liboffload.so: $(build_host_dir)/liboffload.so.5 | $(build_host_dir) - ln -f $< $@ - -$(build_host_dir)/liboffload.so.5: $(obj_liboffload_host) | $(build_host_dir) - $(compiler_host) $(opts_liboffload_host) $(opts_host) $^ -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - - -$(build_target_dir)/liboffload.so: $(build_target_dir)/liboffload.so.5 | $(build_target_dir) - ln -f $< $@ - -$(build_target_dir)/liboffload.so.5: $(obj_liboffload_target) | $(build_target_dir) - $(compiler_target) $(opts_liboffload_target) $(opts_target) $^ -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - - -$(build_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(build_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - - 
-$(imported_dir)/omp.h: $(omp_header_dir)/omp.h | $(imported_dir) - cp $< $@ - - -$(build_host_dir) $(build_target_dir) $(obj_host_dir) $(obj_target_dir): | $(build_dir) - $(shell mkdir -p $@ >/dev/null 2>/dev/null) - @echo "Created $@ directory" - -$(build_dir): - $(shell mkdir -p $@ >/dev/null 2>/dev/null) - @echo "Created $@ directory" - -$(imported_dir): - $(shell mkdir -p $@ >/dev/null 2>/dev/null) - @echo "Created $@ directory" - - -clean: - $(shell rm -rf $(build_dir)) - @echo "Remove $(build_dir) directory" - - -info: - @echo "root_dir = $(root_dir)" - @echo "build_dir = $(build_dir)" - @echo "mpss_dir = $(mpss_dir)" - @echo "mpss_version = $(mpss_version)" - @echo "libiomp_host_dir = $(libiomp_host_dir)" - @echo "libiomp_target_dir = $(libiomp_target_dir)" - @echo "omp_header_dir = $(omp_header_dir)" - @echo "os_host = $(os_host)" - @echo "os_target = $(os_target)" - @echo "compiler_host = $(compiler_host)" - @echo "compiler_target = $(compiler_target)" - @echo "options_host = $(options_host)" - @echo "options_target = $(options_target)" - diff --git a/offload/README.txt b/offload/README.txt deleted file mode 100755 index eb9fb1da3..000000000 --- a/offload/README.txt +++ /dev/null @@ -1,129 +0,0 @@ - - README for Intel(R) Offload Runtime Library - =========================================== - -How to Build Documentation -========================== - -The main documentation is in Doxygen* format, and this distribution -should come with pre-built PDF documentation in doc/Reference.pdf. -However, an HTML version can be built by executing: - -% doxygen doc/doxygen/config - -in this directory. - -That will produce HTML documentation in the doc/doxygen/generated -directory, which can be accessed by pointing a web browser at the -index.html file there. - -If you don't have Doxygen installed, you can download it from -www.doxygen.org. 
- - -Software Requirements -===================== - -Intel(R) Offload Runtime Library requires additional software: - -1) Intel(R) OpenMP* Runtime Library. You can either download the source -code for that (from openmprtl.org or openmp.llvm.org) or simply use the -compiled version distributed with the Intel compilers. -2) Intel(R) COI Runtime Library and Intel(R) MYO Runtime Library. These -libraries are part of Intel(R) Manycore Platform Software Stack (MPSS). You -can download MPSS source code or binaries from -software.intel.com/en-us/articles/intel-manycore-platform-software-stack-mpss. -Binaries include host libraries for Intel(R) 64 Architecture and target -libraries for Intel(R) Many Integrated Core Architecture. - -Also you will require all of the libraries that enable the target code to run -on device. If you target the Intel(R) Xeon Phi (TM) coprocessor, these -libraries can be taken from MPSS too. - - -How to Build the Intel(R) Offload Runtime Library -================================================= - -The Makefile at the top-level will attempt to detect what it needs to -build the Intel(R) Offload Runtime Library. To see the default settings, -type: - -make info - -You can change the Makefile's behavior with the following options: - -root_dir: The path to the top-level directory containing the - top-level Makefile. By default, this will take on the - value of the current working directory. - -build_dir: The path to the build directory. By default, this will - take on value [root_dir]/build. - -mpss_dir: The path to the Intel(R) Manycore Platform Software - Stack install directory. By default, this will take on - the value of operating system's root directory. - -libiomp_host_dir: The path to the host Intel(R) OpenMP* Runtime Library. - This option is required when the host compiler is other - than icc. - -libiomp_target_dir: The path to the target Intel(R) OpenMP* Runtime - Library. This option is required when the target - compiler is other than icc. 
- -omp_header_dir: The path to the header file of Intel(R) OpenMP* - Runtime Library. This option is required if either host - or target compiler is other than icc. - -os_host: Operating system on host. Currently supports only - "linux" which is set by default. - -os_target: Operating system on target device. Currently supports - only "linux" which is set by default. - -compiler_host: Which compiler to use for the build of the host part. - Defaults to "gcc"*. Also supports "icc" and "clang"*. - You should provide the full path to the compiler or it - should be in the user's path. - -compiler_host: Which compiler to use for the build of the target part. - Defaults to "gcc"*. Also supports "icc" and "clang"*. - You should provide the full path to the compiler or it - should be in the user's path. - -options_host: Additional options for the host compiler. - -options_target: Additional options for the target compiler. - -To use any of the options above, simple add =. For -example, if you want to build with icc instead of gcc, type: - -make compiler_host=icc compiler_target=icc - - -Supported RTL Build Configurations -================================== - -Supported Architectures: Intel(R) 64, and Intel(R) Many Integrated -Core Architecture - - --------------------------------------------- - | icc/icl | gcc | clang | ---------------|---------------|---------------------------| -| Linux* OS | Yes | Yes(1) | Yes(1) | -| OS X* | No | No | No | -| Windows* OS | No | No | No | ------------------------------------------------------------ - -(1) Liboffload requires _rdtsc intrinsic, which may be unsupported by some - versions of compiler. 
In this case you need to include src/rdtsc.h - manually by using Makefile options options_host and options_target: - - make options_host="-include src/rdtsc.h" options_target="-include src/rdtsc.h" - ------------------------------------------------------------------------ - -Notices -======= - -*Other names and brands may be claimed as the property of others. diff --git a/offload/doc/Reference.pdf b/offload/doc/Reference.pdf deleted file mode 100644 index b9176f07f..000000000 Binary files a/offload/doc/Reference.pdf and /dev/null differ diff --git a/offload/doc/doxygen/config b/offload/doc/doxygen/config deleted file mode 100755 index 275258f76..000000000 --- a/offload/doc/doxygen/config +++ /dev/null @@ -1,2328 +0,0 @@ -# Doxyfile 1.8.6 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. -# The default value is: UTF-8. 
- -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "Intel® Offload Runtime Library" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify an logo or icon that is included in -# the documentation. The maximum height of the logo should not exceed 55 pixels -# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo -# to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = doc/doxygen/generated - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. 
- -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. 
If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
- -STRIP_FROM_PATH = src/ - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = src/ - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. 
- -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a -# new page for each member. If set to NO, the documentation of a member will be -# part of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 8 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. 
For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. -# -# Note For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. 
- -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by by putting a % sign in front of the word -# or globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. 
- -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. 
When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will -# be included in the documentation. -# The default value is: NO. 
- -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO these classes will be included in the various overviews. 
This option has -# no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = NO - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. 
- -SHOW_INCLUDE_FILES = YES - - -SHOW_GROUPED_MEMB_INC = NO - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. 
- -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the -# todo list. This list is created by putting \todo commands in the -# documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the -# test list. This list is created by putting \test commands in the -# documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. 
- -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES the list -# will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). 
Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. Do not use file names with spaces, bibtex cannot handle them. See -# also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. 
If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = YES - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO doxygen will only warn about wrong or incomplete parameter -# documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. 
If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. -# Note: If this tag is empty the current directory is searched. - -INPUT = src - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank the -# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii, -# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, -# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, -# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, -# *.qsf, *.as and *.js. - -FILE_PATTERNS = *.c *.h *.cpp *.f90 - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. 
This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = src/imported src/rdtsc.h - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. 
- -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# -# -# where is the value of the INPUT_FILTER tag, and is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER ) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. 
- -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = YES - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. 
- -REFERENCED_BY_RELATION = YES - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES, then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. 
-# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more acurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# compiled with the --with-libclang option. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. 
-# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user- -# defined cascading style sheet that is included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefor more robust against future updates. -# Doxygen will copy the style sheet file to the output directory. For an example -# see the documentation. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the stylesheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. 
-# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. 
Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler ( hhc.exe). If non-empty -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated ( -# YES) or that it should be included in the master .chm file ( NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
- -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated ( -# YES) or a normal table of contents ( NO) in the .chm file. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. 
- -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES.
- -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering -# instead of using prerendered bitmaps. Use this if you do not have LaTeX -# installed or if you want the formulas to look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering.
For example -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to -# search using the keyboard; to jump to the search box use <access key> + S -# (what the <access key> is depends on the OS and browser, but it is typically -# <CTRL>, <ALT>/