From c836b8d9e9a3384fbf0dcb2289e6c00691a94317 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 22 Jun 2018 07:52:06 -0700 Subject: [PATCH 01/64] [OMPD] Fixing runtime ompd type references --- runtime/src/ompd-specific.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/src/ompd-specific.h b/runtime/src/ompd-specific.h index d0f32422f..8abf0848d 100644 --- a/runtime/src/ompd-specific.h +++ b/runtime/src/ompd-specific.h @@ -85,8 +85,8 @@ OMPD_ACCESS(ompt_thread_info_t, wait_id) \ OMPD_ACCESS(ompt_data_t, value) \ OMPD_ACCESS(ompt_data_t, ptr) \ \ -OMPD_ACCESS(ompt_frame_t, exit_frame) \ -OMPD_ACCESS(ompt_frame_t, enter_frame) \ +OMPD_ACCESS(omp_frame_t, exit_frame) \ +OMPD_ACCESS(omp_frame_t, enter_frame) \ \ OMPD_ACCESS(ompt_lw_taskteam_t, parent) \ OMPD_ACCESS(ompt_lw_taskteam_t, ompt_team_info) \ From 73fe9dececd1a2b77b03557665076d8fa474e47f Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 22 Jun 2018 07:52:51 -0700 Subject: [PATCH 02/64] [OMPD] add cmake modules necessary to build gdb-wrapper --- libompd/gdb-wrapper/CMakeLists.txt | 4 ++ .../cmake/Modules/FindCudaGDB.cmake | 60 +++++++++++++++++++ .../gdb-wrapper/cmake/Modules/FindGDB.cmake | 60 +++++++++++++++++++ .../cmake/Modules/FindReadline.cmake | 47 +++++++++++++++ 4 files changed, 171 insertions(+) create mode 100644 libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake create mode 100644 libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake create mode 100644 libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake diff --git a/libompd/gdb-wrapper/CMakeLists.txt b/libompd/gdb-wrapper/CMakeLists.txt index c3ea2824c..11a4f624c 100644 --- a/libompd/gdb-wrapper/CMakeLists.txt +++ b/libompd/gdb-wrapper/CMakeLists.txt @@ -1,5 +1,9 @@ project (odb) +cmake_minimum_required(VERSION 2.8) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") + set (cppfiles InputOutputManager.cpp ChildProcess.cpp diff --git a/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake b/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake new file mode 100644 index 000000000..14613ae4f --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake @@ -0,0 +1,60 @@ +# - Try to find GDB +# +# Once done, this will define: +# CUDA_GDB_FOUND - system has CUDA_GDB +# CUDA_GDB_COMMAND - the command to run +# CUDA_GDB_VERSION - version +# CUDA_GDB_HAS_RETURN_CHILD_RESULT - if the --return-child-result flag is supported +# +# Useful configuration variables you might want to add to your cache: +# CUDA_GDB_ROOT_DIR - A directory prefix to search +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. 
+# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +set(CUDA_GDB_ROOT_DIR + "${CUDA_GDB_ROOT_DIR}" + CACHE + PATH + "Directory to start our search in") + +find_program(CUDA_GDB_COMMAND + NAMES + cuda-gdb + HINTS + "${CUDA_GDB_ROOT_DIR}" + PATH_SUFFIXES + bin + libexec) + +if(CUDA_GDB_COMMAND) + execute_process(COMMAND cuda-gdb --version + COMMAND head -n 1 + OUTPUT_VARIABLE CUDA_GDB_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "[^0-9]*([0-9]+[0-9.]*).*" "\\1" CUDA_GDB_VERSION "${CUDA_GDB_VERSION}") +endif() + +# handle the QUIETLY and REQUIRED arguments and set xxx_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUDA_GDB DEFAULT_MSG CUDA_GDB_COMMAND CUDA_GDB_VERSION) + +if(CUDA_GDB_FOUND) + mark_as_advanced(CUDA_GDB_ROOT_DIR) + if(CUDA_GDB_VERSION VERSION_LESS 6.4) + set(CUDA_GDB_HAS_RETURN_CHILD_RESULT FALSE) + else() + set(CUDA_GDB_HAS_RETURN_CHILD_RESULT TRUE) + endif() +endif() + +mark_as_advanced(CUDA_GDB_COMMAND) diff --git a/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake b/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake new file mode 100644 index 000000000..a5f743da6 --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake @@ -0,0 +1,60 @@ +# - Try to find GDB +# +# Once done, this will define: +# GDB_FOUND - system has GDB +# GDB_COMMAND - the command to run +# GDB_VERSION - version +# GDB_HAS_RETURN_CHILD_RESULT - if the --return-child-result flag is supported +# +# Useful configuration variables you might want to add to your cache: +# GDB_ROOT_DIR - A directory prefix to search +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. 
+# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +set(GDB_ROOT_DIR + "${GDB_ROOT_DIR}" + CACHE + PATH + "Directory to start our search in") + +find_program(GDB_COMMAND + NAMES + gdb + HINTS + "${GDB_ROOT_DIR}" + PATH_SUFFIXES + bin + libexec) + +if(GDB_COMMAND) + execute_process(COMMAND gdb --version + COMMAND head -n 1 + OUTPUT_VARIABLE GDB_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "[^0-9]*([0-9]+[0-9.]*).*" "\\1" GDB_VERSION "${GDB_VERSION}") +endif() + +# handle the QUIETLY and REQUIRED arguments and set xxx_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GDB DEFAULT_MSG GDB_COMMAND GDB_VERSION) + +if(GDB_FOUND) + mark_as_advanced(GDB_ROOT_DIR) + if(GDB_VERSION VERSION_LESS 6.4) + set(GDB_HAS_RETURN_CHILD_RESULT FALSE) + else() + set(GDB_HAS_RETURN_CHILD_RESULT TRUE) + endif() +endif() + +mark_as_advanced(GDB_COMMAND) diff --git a/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake b/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake new file mode 100644 index 000000000..745cfe583 --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake @@ -0,0 +1,47 @@ +# - Try to find readline include dirs and libraries +# +# Usage of this module as follows: +# +# find_package(Readline) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Readline_ROOT_DIR Set this variable to the root installation of +# readline if the module has problems finding the +# proper installation path. +# +# Variables defined by this module: +# +# READLINE_FOUND System has readline, include and lib dirs found +# Readline_INCLUDE_DIR The readline include directories. +# Readline_LIBRARY The readline library. + +find_path(Readline_ROOT_DIR + NAMES include/readline/readline.h +) + +find_path(Readline_INCLUDE_DIR + NAMES readline/readline.h + HINTS ${Readline_ROOT_DIR}/include +) + +find_library(Readline_LIBRARY + NAMES readline + HINTS ${Readline_ROOT_DIR}/lib +) + +if(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + set(READLINE_FOUND TRUE) +else(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + FIND_LIBRARY(Readline_LIBRARY NAMES readline) + include(FindPackageHandleStandardArgs) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG Readline_INCLUDE_DIR Readline_LIBRARY ) + MARK_AS_ADVANCED(Readline_INCLUDE_DIR Readline_LIBRARY) +endif(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + +mark_as_advanced( + Readline_ROOT_DIR + Readline_INCLUDE_DIR + Readline_LIBRARY +) From 255356b0b315c9c92ba15aeccda4fed43d587466 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 25 Jun 2018 14:20:25 -0700 Subject: [PATCH 03/64] [OMPD] align libompd and odb w/ new OpenMP spec This commit aligns libompd and gdb-wrapper with a newer OpenMP spec. * Some functionality was desiabled as it relies on information no longer exposed via OMPD. * The OMPD callback functions still have non-standard names (but signatures should be correct now) * Cuda device initialization should work now, but not much other OMPD device funtionality. 
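For reference, the debugger-side bring-up that the gdb-wrapper performs after this change looks roughly like the sketch below. This is a trimmed illustration, not the full OMPDCommandFactory constructor: the callback table is assumed to be populated as in Callbacks.cpp, only two entry points are resolved, and error handling is reduced to early returns.

// Minimal sketch of loading libompd and initializing it through the new
// (version-first) ompd_initialize signature adopted in this patch.
#include <dlfcn.h>
#include <cassert>
#include "ompd.h"
#include "ompd_typedefs.h"

static ompd_address_space_handle_t *
ompd_bring_up(ompd_address_space_context_t *proc_context,
              const ompd_callbacks_t *callbacks) {
  void *dll = dlopen("libompd.so", RTLD_LAZY);   // was libompd_intel.so
  assert(dll && "could not load the OMPD plugin");

  auto init = (ompd_initialize_fn_t)dlsym(dll, "ompd_initialize");
  auto proc_init =
      (ompd_process_initialize_fn_t)dlsym(dll, "ompd_process_initialize");
  assert(init && proc_init && "missing OMPD entry points");

  // API version first, then the callback table (previously the table came first).
  if (init(0, callbacks) != ompd_rc_ok)
    return nullptr;

  ompd_address_space_handle_t *addr_handle = nullptr;
  if (proc_init(proc_context, &addr_handle) != ompd_rc_ok)
    return nullptr;
  return addr_handle;                            // handle for the host process
}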
--- libompd/gdb-wrapper/Callbacks.cpp | 20 +- libompd/gdb-wrapper/Callbacks.h | 6 +- libompd/gdb-wrapper/OMPDCommand.cpp | 90 ++-- libompd/gdb-wrapper/OMPDCommand.h | 34 +- libompd/gdb-wrapper/OMPDContext.h | 2 +- libompd/gdb-wrapper/ompd_typedefs.h | 187 +++++++++ libompd/gdb-wrapper/ompd_typedefs.h.bak | 521 ++++++++++++++++++++++++ libompd/src/TargetValue.cpp | 21 +- libompd/src/omp-debug.cpp | 68 +++- libompd/src/ompd.h | 37 +- libomptarget/CMakeLists.txt | 6 + libomptarget/plugins/cuda/src/rtl.cpp | 17 + runtime/src/ompd-specific.cpp | 46 +-- runtime/src/ompd-specific.h | 4 + 14 files changed, 959 insertions(+), 100 deletions(-) create mode 100644 libompd/gdb-wrapper/ompd_typedefs.h create mode 100644 libompd/gdb-wrapper/ompd_typedefs.h.bak diff --git a/libompd/gdb-wrapper/Callbacks.cpp b/libompd/gdb-wrapper/Callbacks.cpp index e15e7e795..d579bf7e1 100644 --- a/libompd/gdb-wrapper/Callbacks.cpp +++ b/libompd/gdb-wrapper/Callbacks.cpp @@ -40,8 +40,7 @@ void initializeCallbacks(const GdbProcessPtr &proc) cb.dmemory_alloc = CB_dmemory_alloc; cb.dmemory_free = CB_dmemory_free; cb.print_string = CB_print_string; - cb.get_thread_context_for_osthread = CB_thread_context; - cb.get_containing_process_context = CB_process_context; + cb.get_thread_context_for_thread_id = CB_thread_context; cb.tsizeof_prim = CB_tsizeof_prim; cb.tsymbol_addr = CB_tsymbol_addr; cb.read_tmemory = CB_read_tmemory; @@ -78,14 +77,14 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( ompd_address_space_context_t *context, - ompd_osthread_kind_t kind, + ompd_thread_id_kind_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext ) { ompd_rc_t ret = context ? ompd_rc_ok : ompd_rc_stale_handle; - if (kind == ompd_osthread_cudalogical) { + if (kind == ompd_thread_id_cudalogical) { *tcontext = ((OMPDContext*)context)->getContextForThread((CudaThread*)osthread); } else { @@ -135,7 +134,12 @@ ompd_rc_t CB_tsizeof_prim( inited=1; init_sizes(); } - memcpy(sizes, prim_sizes, sizeof(prim_sizes[0])*ompd_type_max); + sizes->sizeof_char = prim_sizes[ompd_type_char]; + sizes->sizeof_short = prim_sizes[ompd_type_short]; + sizes->sizeof_int = prim_sizes[ompd_type_int]; + sizes->sizeof_long = prim_sizes[ompd_type_long]; + sizes->sizeof_long_long = prim_sizes[ompd_type_long_long]; + sizes->sizeof_pointer = prim_sizes[ompd_type_pointer]; return ret; } @@ -175,7 +179,7 @@ ompd_rc_t CB_tsymbol_addr( parser.matchAddressValue(gdb->readOutput().c_str(), addr); if (strlen(addr) > 0) - symbol_addr->address = (ompd_taddr_t) strtoull (addr, NULL, 0); + symbol_addr->address = (ompd_addr_t) strtoull (addr, NULL, 0); else if (strlen(addr) == 0) ret = ompd_rc_error; @@ -267,7 +271,7 @@ ompd_rc_t CB_write_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, const void *buffer) { return ompd_rc_unsupported; @@ -277,7 +281,7 @@ ompd_rc_t CB_read_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, void *buffer) { if (!context) diff --git a/libompd/gdb-wrapper/Callbacks.h b/libompd/gdb-wrapper/Callbacks.h index d93c74580..349e30f11 100644 --- a/libompd/gdb-wrapper/Callbacks.h +++ b/libompd/gdb-wrapper/Callbacks.h @@ -48,7 +48,7 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( ompd_address_space_context_t *context, - ompd_osthread_kind_t kind, + ompd_thread_id_kind_t kind, ompd_size_t sizeof_osthread, const void* 
osthread, ompd_thread_context_t **tcontext); @@ -71,7 +71,7 @@ ompd_rc_t CB_read_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, const ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, void *buffer ); @@ -79,7 +79,7 @@ ompd_rc_t CB_write_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, const ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, const void *buffer ); diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index b1a82e67f..647ecf109 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -5,14 +5,16 @@ * Author: Ignacio Laguna * Contact: ilaguna@llnl.gov */ -#include +//#include #include "OMPDCommand.h" #include "OMPDContext.h" #include "Callbacks.h" #include "OutputString.h" #include "Debug.h" +#include "omp.h" #include "ompd.h" -#include "ompd_test.h" +//#include "ompd_test.h" +#define ODB_LINUX #include "CudaGdb.h" #include @@ -38,9 +40,9 @@ OMPDCommandFactory::OMPDCommandFactory() // Load OMPD DLL and get a handle #ifdef ODB_LINUX - functions->ompdLibHandle = dlopen("libompd_intel.so", RTLD_LAZY); + functions->ompdLibHandle = dlopen("libompd.so", RTLD_LAZY); #elif defined(ODB_MACOS) - functions->ompdLibHandle = dlopen("libompd_intel.dylib", RTLD_LAZY); + functions->ompdLibHandle = dlopen("libompd.dylib", RTLD_LAZY); #else #error Unsupported platform! #endif @@ -76,7 +78,7 @@ FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) // Initialize OMPD library ompd_callbacks_t *table = getCallbacksTable(); assert(table && "Invalid callbacks table"); - ompd_rc_t ret = functions->ompd_initialize(table); + ompd_rc_t ret = functions->ompd_initialize(0, table); if (ret != ompd_rc_ok) { out << "ERROR: could not initialize OMPD\n"; @@ -191,11 +193,11 @@ void OMPDThreads::execute() const for(auto i: thread_ids) { ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( - addrhandle, ompd_osthread_pthread, sizeof(i.second), + addrhandle, ompd_thread_id_pthread, sizeof(i.second), &(i.second), &thread_handle); if (ret == ompd_rc_ok) { - ompd_state_t state; + ompd_word_t state; ompd_wait_id_t wait_id; ret = functions->ompd_get_state(thread_handle, &state, &wait_id); printf(" %-12u %p 0x%lx\t%i\t%lx\n", @@ -211,23 +213,30 @@ void OMPDThreads::execute() const int omp_cuda_threads = 0; vector cuda_ContextPools; map device_initialized; - map address_spaces; + map address_spaces; for(auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { + cout << "Cuda device with context " << i.coord.cudaContext << "not initialized as OpenMP device. Trying to initialize\n"; OMPDCudaContextPool* cpool; cpool = new OMPDCudaContextPool(&i); ompd_rc_t result; device_initialized[i.coord.cudaContext] = true; result = functions->ompd_device_initialize( - cpool->getGlobalOmpdContext(), - i.coord.cudaContext, - ompd_device_kind_cuda, + addrhandle, + cpool->getGlobalOmpdContext(), + ompd_device_kind_cuda, + sizeof(i.coord.cudaContext), + &i.coord.cudaContext, &cpool->ompd_device_handle); if (result != ompd_rc_ok) + { + cout << "Could not initalize device with context " << i.coord.cudaContext << ". 
Probably not a OpenMP device\n"; continue; + } + cout << "Device initialized\n"; address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; } @@ -235,7 +244,7 @@ void OMPDThreads::execute() const ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( address_spaces[i.coord.cudaContext], - ompd_osthread_cudalogical, + ompd_thread_id_cudalogical, sizeof(i.coord), &i.coord, &thread_handle); @@ -321,7 +330,7 @@ void OMPDCallback::execute() const /*ompd_rc_t CB_read_tmemory ( ompd_context_t *context, - ompd_taddr_t addr, + ompd_addr_t addr, ompd_tword_t bufsize, void *buffer );*/ @@ -333,7 +342,7 @@ void OMPDCallback::execute() const return; } long long temp=0; - ompd_taddr_t addr = (ompd_taddr_t)strtoll(extraArgs[1].c_str(), NULL, 0); + ompd_addr_t addr = (ompd_addr_t)strtoll(extraArgs[1].c_str(), NULL, 0); int cnt = atoi(extraArgs[2].c_str()); ret = CB_read_tmemory( host_contextPool->getGlobalOmpdContext(), NULL, {0,addr}, cnt, &temp); @@ -345,7 +354,7 @@ void OMPDCallback::execute() const /*ompd_rc_t CB_tsymbol_addr ( ompd_context_t *context, const char *symbol_name, - ompd_taddr_t *symbol_addr);*/ + ompd_addr_t *symbol_addr);*/ if (extraArgs[0] == "tsymbol_addr") { @@ -388,6 +397,7 @@ void OMPDApi::execute() const if (extraArgs[0] == "get_threads") { +#if 0 // MARKER_MR: TODO: reimplement this functionality with breakpoints if(extraArgs.size()>1) { hout << "Usage: odb api get_threads" << endl; @@ -406,6 +416,8 @@ void OMPDApi::execute() const sout << "0x" << hex << thread_handle_array[i] << ", "; } sout << endl << ""; +#endif + hout << "The 'odb api threads' command has been temporarily removed for the migration to a new ompd standard\n"; } } @@ -424,7 +436,7 @@ vector odbGetThreadHandles(ompd_address_space_handle_t* a { ompd_thread_handle_t* thread_handle; ret = functions->ompd_get_thread_handle( - addrhandle, ompd_osthread_pthread, sizeof(i.second) ,&(i.second), &thread_handle); + addrhandle, ompd_thread_id_pthread, sizeof(i.second) ,&(i.second), &thread_handle); if (ret!=ompd_rc_ok) continue; thread_handles.push_back(thread_handle); @@ -437,7 +449,7 @@ vector odbGetParallelRegions(OMPDFunctionsPtr functions ompd_rc_t ret; ompd_parallel_handle_t * parallel_handle; vector parallel_handles; - ret = functions->ompd_get_top_parallel_region( + ret = functions->ompd_get_current_parallel_handle( th, ¶llel_handle); while(ret == ompd_rc_ok) { @@ -450,6 +462,10 @@ vector odbGetParallelRegions(OMPDFunctionsPtr functions bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { + sout << "Checking of parallel IDs has been disabled for upgrade of ompd in branch ompd-devices\n"; + // MARKER_MR: TODO: fix checking of parallel ids + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -466,10 +482,15 @@ bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { + sout << "Checking of parallel IDs has been disable for upgrade of ompd in branch ompd-devices\n"; + // MARKER_MR: TODO: fix checking of parallel ids for num threads + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -486,10 +507,15 @@ bool odbCheckParallelNumThreads(OMPDFunctionsPtr functions, vector ths) { + sout << "Checking of task IDs has been disable for upgrade of ompd in branch ompd-devices\n"; + // MARKER_MR: TODO: fix checking of task ids + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -506,20 +532,21 @@ bool odbCheckTaskIDs(OMPDFunctionsPtr functions, vector ths if (ompt_res != ompd_res) res=false; } return res; +#endif } vector 
odbGetTaskRegions(OMPDFunctionsPtr functions, ompd_thread_handle_t* th) { ompd_rc_t ret; - ompd_task_handle_t * task_handle; + ompd_task_handle_t *task_handle; vector task_handles; - ret = functions->ompd_get_top_task_region( + ret = functions->ompd_get_current_task_handle( th, &task_handle); while(ret == ompd_rc_ok) { task_handles.push_back(task_handle); - ret = functions->ompd_get_ancestor_task_region( - task_handle, &task_handle); + ret = functions->ompd_get_generating_task_handle( + task_handle, &task_handle); // MARKER_MR: TODO: is it generating or scheduling task or something different? } return task_handles; } @@ -527,16 +554,25 @@ vector odbGetTaskRegions(OMPDFunctionsPtr functions, ompd_t vector odbGetImplicitTasks(OMPDFunctionsPtr functions, ompd_parallel_handle_t* ph) { // ompd_rc_t ret; - ompd_task_handle_t** task_handles; - int num_tasks; + int num_tasks = evalGdbExpression("call omp_get_num_threads()"); vector return_handles; - /*ret = */functions->ompd_get_implicit_task_in_parallel( + + for (int i=0; i < num_tasks; ++i) { + ompd_task_handle_t* task_handle; + functions->ompd_get_task_in_parallel( + ph, i, &task_handle); + return_handles.push_back(task_handle); + } +#if 0 + ompd_task_handle_t* task_handles; + /*ret = */functions->ompd_get_task_in_parallel( ph, &task_handles, &num_tasks); for(int i=0; iompd_get_task_id( ith, &tid); - sout << "0x" << hex << ith << " (" << tid << "), "; +#endif + sout << "0x" << hex << ith << " (" << "DISABLED IN ompd-devices" << "), "; functions->ompd_release_task_handle(ith); } sout << endl; @@ -587,7 +625,7 @@ void OMPDTest::execute() const } sout << endl; pthread_t osthread; - functions->ompd_get_osthread(thr_h, ompd_osthread_pthread, sizeof(pthread_t), &osthread); + functions->ompd_get_thread_id(thr_h, ompd_thread_id_pthread, sizeof(pthread_t), &osthread); host_contextPool->getThreadContext(&osthread)->setThisGdbContext(); odbCheckParallelIDs(functions, parallel_h); odbCheckTaskIDs(functions, task_h); diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index 04e8bf912..b3c671031 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -30,7 +30,7 @@ #include #include "ompd.h" #include "ompd_typedefs.h" -#include "ompd_test.h" +//#include "ompd_test.h" /* @@ -57,31 +57,37 @@ macro(ompd_device_initialize) \ macro(ompd_release_address_space_handle) \ macro(ompd_initialize) \ macro(ompd_finalize) \ -macro(ompd_get_threads) \ +/*macro(ompd_get_threads) */\ macro(ompd_get_thread_in_parallel) \ macro(ompd_release_thread_handle) \ macro(ompd_thread_handle_compare) \ -macro(ompd_get_top_parallel_region) \ +macro(ompd_get_thread_id) \ +/*macro(ompd_get_top_parallel_region)*/ \ +macro(ompd_get_current_parallel_handle) \ macro(ompd_get_enclosing_parallel_handle) \ -macro(ompd_get_task_enclosing_parallel_handle) \ +/*macro(ompd_get_task_enclosing_parallel_handle) */\ macro(ompd_release_parallel_handle) \ macro(ompd_parallel_handle_compare) \ -macro(ompd_get_top_task_region) \ +/*macro(ompd_get_top_task_region) \ macro(ompd_get_ancestor_task_region) \ -macro(ompd_get_implicit_task_in_parallel) \ +macro(ompd_get_implicit_task_in_parallel) */\ +macro(ompd_get_current_task_handle) \ +macro(ompd_get_generating_task_handle) \ +/*macro(ompd_get_scheduling_task_handle)*/ \ +macro(ompd_get_task_in_parallel) \ macro(ompd_release_task_handle) \ macro(ompd_task_handle_compare) \ -macro(ompd_get_num_procs) \ +/*macro(ompd_get_num_procs) \ macro(ompd_get_thread_limit) \ macro(ompd_get_num_threads) \ 
macro(ompd_get_level) \ macro(ompd_get_active_level) \ macro(ompd_get_parallel_id) \ -macro(ompd_get_parallel_function) \ +macro(ompd_get_parallel_function) */\ macro(ompd_get_thread_handle) \ -macro(ompd_get_osthread) \ +/*macro(ompd_get_osthread)*/ \ macro(ompd_get_state) \ -macro(ompd_get_max_threads) \ +/*macro(ompd_get_max_threads) \ macro(ompd_get_thread_num) \ macro(ompd_in_parallel) \ macro(ompd_in_final) \ @@ -89,11 +95,11 @@ macro(ompd_get_dynamic) \ macro(ompd_get_nested) \ macro(ompd_get_max_active_levels) \ macro(ompd_get_schedule) \ -macro(ompd_get_proc_bind) \ +macro(ompd_get_proc_bind)*/ \ macro(ompd_get_task_frame) \ -macro(ompd_get_task_id) \ -macro(ompd_get_version) \ -macro(ompd_get_version_string) \ +/*macro(ompd_get_task_id) */\ +macro(ompd_get_api_version) \ +/*macro(ompd_get_version_string) \*/ namespace ompd_gdb { diff --git a/libompd/gdb-wrapper/OMPDContext.h b/libompd/gdb-wrapper/OMPDContext.h index be3142439..89793543e 100644 --- a/libompd/gdb-wrapper/OMPDContext.h +++ b/libompd/gdb-wrapper/OMPDContext.h @@ -16,7 +16,7 @@ */ #include "ompd.h" -#include "ompd_test.h" +//#include "ompd_test.h" #include "GdbProcess.h" #include "Callbacks.h" #include "CudaGdb.h" diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h new file mode 100644 index 000000000..39fe07a8d --- /dev/null +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -0,0 +1,187 @@ +#include "ompd.h" + +/* this should be somewhere else*/ +typedef uint64_t omp_device_t; +typedef ompd_thread_id_kind_t ompd_thread_id_t; + +/* 4.3.4.1 + * Global initialization and finalization + */ + +typedef ompd_rc_t (*ompd_initialize_fn_t) ( + ompd_word_t api_version, + const ompd_callbacks_t *callbacks +); + +typedef ompd_rc_t (*ompd_get_api_version_fn_t) ( + ompd_word_t *version +); + +typedef ompd_rc_t (*ompd_get_version_string_fn_t) ( + const char **string +); + +typedef ompd_rc_t (*ompd_finalize_fn_t) (void); + +/* 4.3.4.2 + * Per OpenMP Process Initialiyation and Finalization + */ + +typedef ompd_rc_t (*ompd_process_initialize_fn_t) ( + ompd_address_space_context_t *context, + ompd_address_space_handle_t **handle + ); + +typedef ompd_rc_t (*ompd_device_initialize_fn_t) ( + ompd_address_space_handle_t *process_handle, /*IN: address space of the OpenMP process*/ + ompd_address_space_context_t *device_context, /*IN: Opaque tool handle for device address space*/ + omp_device_t kind, /*IN: device identifier kind*/ + ompd_size_t sizeof_id, /*IN: size of device identifier*/ + void *id, /*IN: device identifier*/ + ompd_address_space_handle_t **device_handle /*OUT: device handle*/ + ); + + +typedef ompd_rc_t (*ompd_release_address_space_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ + ); + +/* 4.3.4.4 + * Address space information + */ + +typedef ompd_rc_t (*ompd_get_omp_version_fn_t) ( + ompd_address_space_handle_t *address_space, + ompd_word_t *omp_version + ); + +typedef ompd_rc_t (*ompd_get_omp_version_string_fn_t) ( + ompd_address_space_handle_t *address_space, + const char **string + ); + +/* 4.3.4.5 + * Thread Handles + */ + +typedef ompd_rc_t (*ompd_get_thread_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /*IN: handle for the parallel region*/ + int thread_num, /*IN: the nubmer of the thread that is returned*/ + ompd_thread_handle_t **thread_hanlde /*OUT: returned thread handle*/ + ); + + +typedef ompd_rc_t (*ompd_get_thread_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + 
ompd_thread_id_t kind, + ompd_size_t sizeof_osthread, + const void* osthread, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ + ); + +typedef ompd_rc_t (*ompd_release_thread_handle_fn_t) ( + ompd_thread_handle_t *thread_handle +); + +typedef ompd_rc_t (*ompd_thread_handle_compare_fn_t) ( + ompd_thread_handle_t *thread_handle_1, + ompd_thread_handle_t *thread_handle_2, + int *cmp_value +); + +typedef ompd_rc_t (*ompd_get_thread_id_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, + void *thread_id + ); + +/* 4.3.4.6 + * Parallel Region Handles + */ + +typedef ompd_rc_t (*ompd_get_current_parallel_handle_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_parallel_handle_t **parallel_handle + ); + +typedef ompd_rc_t (*ompd_get_enclosing_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ); + +typedef ompd_rc_t (*ompd_get_task_parallel_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_parallel_handle_t **task_parallel_handle + ); + +typedef ompd_rc_t (*ompd_release_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle + ); + +typedef ompd_rc_t (*ompd_parallel_handle_compare_fn_t) ( + ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value + ); + +/* 4.3.4.7 + * Task Handles + */ + +typedef ompd_rc_t (*ompd_get_current_task_handle_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_task_handle_t **task_handle + ); + +typedef ompd_rc_t (*ompd_get_generating_task_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_task_handle_t **generating_task_handle + ); + +typedef ompd_rc_t (*ompd_get_scheduling_task_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_task_handle_t **scheduling_task_handle + ); + +typedef ompd_rc_t (*ompd_get_task_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_task_handle_t **task_handle + ); + +typedef ompd_rc_t (*ompd_release_task_handle_fn_t) ( + ompd_task_handle_t *task_handle +); + +typedef ompd_rc_t (*ompd_task_handle_compare_fn_t) ( + ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value +); + +typedef ompd_rc_t (*ompd_get_task_function_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_address_t *entry_point + ); + +typedef ompd_rc_t (*ompd_get_task_frame_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_address_t *exit_frame, + ompd_address_t *enter_frame + ); + +typedef ompd_rc_t (*ompd_enumerate_states_fn_t) ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + +typedef ompd_rc_t (*ompd_get_state_fn_t) ( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + diff --git a/libompd/gdb-wrapper/ompd_typedefs.h.bak b/libompd/gdb-wrapper/ompd_typedefs.h.bak new file mode 100644 index 000000000..2bad5c82b --- /dev/null +++ b/libompd/gdb-wrapper/ompd_typedefs.h.bak @@ -0,0 +1,521 @@ +/* + * ompd.h + * + * Created on: Dec 22, 2014 + * Author: Ignacio Laguna + * Joachim Protze + * Contact: ilaguna@llnl.gov + * protze@llnl.gov + */ +/****************************************************************************** + * This header file defines the OMPD interface: an interface 
to help debuggers + * to inspect state associated with OpenMP programming abstractions in a target + * process. The interface is implemented in a dynamically loaded library (DLL) + * that the debugger loads into its address space. + * + * Name conventions: + * - All named entities start with the prefix "ompd_" (for OpenMP debugging) + * - Type entities end with the suffix "_t" (for type) + * - Function types end with the suffix "_fn_t" (for function type) + * - Return code entities have "_rc_" in it + * - Abstractions referring to the target have the prefix "t" (e.g., + * "tmemory" for memory in the target, or "tsymbol" for symbol in the target) + * - Abstractions referring to the debugger have the prefix "d" (e.g., + * "dmemory" for memory in the debugger) + * + * Comment conventions: + * - Input function parameters denoted by "IN:" + * - Output function parameters denoted by "OUT:" + */ +/****************************************************************************** + * General types and data structures + */ +/** + * Basic types. + */ +/** + * The following definitions match with ptx information stored in DWARF + */ +/* + * Definition of OMPD states, taken from OMPT + */ +/** + * Context handle. + * This is used by the debugger to identify a target process (or core file). + * This will be cast to concrete types within the debugger. The callbacks use + * context handles to specify the debugger where to look up (since the debugger + * can be handling different contexts at the same time, e.g., processes and/or + * core files). Without context handles the debugger would not know the target + * of a callback request. + */ +/** + * OpenMP abstractions handles. + * Each operation in the OMPD interface must explicitly specify a handle for the + * context of the operation. OMPD uses context handles for OpenMP entities, such + * as threads, parallel regions, and tasks. A handle for an entity is constant + * while the entity itself is live. + */ +/** + * Other handles. + */ +/** + * Logical coordinates of OMP target device threads + */ +/** + * Return codes. + * Each OMPD operation returns a code. + */ +/** + * Primitive types. + */ +/** + * Primitive type sizes. + * These types are used by OMPD to interrogate the debugger about the size of + * primitive types in the target. + */ +/****************************************************************************** + * Debugger callback signatures. + * These callback function signatures are used by OMPD to obtain state + * information of a target process, in particular to interrogate about info + * that is dependent on a particular OpenMP runtime library. Typical queries are + * sizes of primitive types in the target, symbols lookup, lookup of offsets of + * fields in a type/structure, and read/write to memory in the target. + */ +/** + * Allocate memory in the debugger's address space. + */ +/** + * Free memory in the debugger's address space. + */ +/** + * Get thread specific context. + */ +/** + * Get containing (host) process context for address_space_context + */ +/** + * Look up the sizes of primitive types in the target + */ +/** + * Look up the address of a global symbol in the target + */ +/** + * Read memory from the target + */ +/** + * Write memory from the target + */ +/** + * This is used by the OMPD library to have the debugger print a string. + * The OMPD should not print directly. + */ +/** + * Callbacks table. 
+ */ +/****************************************************************************** + * Call signatures from the debugger to the OMPD DLL. + */ +/* --- 4 Initialization ----------------------------------------------------- */ + +/** + * The OMPD function ompd_get_version_string returns a descriptive string + * describing an implementation of the OMPD library. The function + * ompd_get_version_compatibility returns an integer code used to indicate the + * revision of the OMPD specification supported by an implementation of OMPD. + */ +typedef ompd_rc_t (*ompd_get_version_fn_t) ( + int *version + ); +typedef ompd_rc_t (*ompd_get_version_string_fn_t) ( + const char **string /* OUT: OMPD version string */ + ); +/** + * Initialize OMPD. + * This provides the DLL the pointers to the debugger's functions to obtain + * information about the OpenMP runtime library. The debugger promises to + * maintain the functions valid for as long as needed. + */ +typedef ompd_rc_t (*ompd_initialize_fn_t) ( + const ompd_callbacks_t *table /* IN: callbacks table */ + ); +typedef ompd_rc_t (*ompd_process_initialize_fn_t) ( + ompd_address_space_context_t *context, /* IN: debugger handle for the target */ + ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the target */ + ); +typedef ompd_rc_t (*ompd_release_address_space_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ + ); +typedef ompd_rc_t (*ompd_device_initialize_fn_t) ( + ompd_address_space_context_t *context, /* IN: debugger handle for the device */ + ompd_device_identifier_t id, /* IN: object defined by native device API */ + ompd_device_kind_t kind, /* IN: */ + ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the device */ + ); +typedef ompd_rc_t (*ompd_finalize_fn_t) ( void ); + +/* --- 4 Handle Management -------------------------------------------------- */ + +/* --- 4.1 Thread Handles --------------------------------------------------- */ + +/** + * Retrieve handles for all OpenMP threads. + * + * The ompd_get_threads operation enables the debugger to obtain handles for all + * OpenMP threads. A successful invocation of ompd_get_threads returns a pointer + * to a vector of handles in thread_handle_array and returns the number of + * handles in num_handles. This call yields meaningful results only if all + * OpenMP threads are stopped; otherwise, the OpenMP runtime may be creating + * and/or destroying threads during or after the call, rendering useless the + * vector of handles returned. + */ +typedef ompd_rc_t (*ompd_get_threads_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ + int *num_handles /* OUT: number of handles in the array */ + ); +/** + * Retrieve handles for OpenMP threads in a parallel region. + * + * The ompd_get_thread_in_parallel operation enables the debugger to obtain + * handles for all OpenMP threads associated with a parallel region. A + * successful invocation of ompd_get_thread_in_parallel returns a pointer to a + * vector of handles in thread_handle_array and returns the number of handles in + * num_handles. This call yields meaningful results only if all OpenMP threads + * in the parallel region are stopped; otherwise, the OpenMP runtime may be + * creating and/or destroying threads during or after the call, rendering + * useless the vector of handles returned. 
+ */ +typedef ompd_rc_t (*ompd_get_thread_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN */ + ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ + int *num_handles /* OUT: number of handles in the array */ + ); +typedef ompd_rc_t (*ompd_get_master_thread_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN */ + ompd_thread_handle_t **thread_handle); +typedef ompd_rc_t (*ompd_release_thread_handle_fn_t) ( + ompd_thread_handle_t *thread_handle +); +typedef ompd_rc_t (*ompd_thread_handle_compare_fn_t) ( + ompd_thread_handle_t *thread_handle_1, + ompd_thread_handle_t *thread_handle_2, + int *cmp_value +); +typedef ompd_rc_t (*ompd_get_thread_handle_string_id_fn_t) ( + ompd_thread_handle_t *thread_handle, + char **string_id +); +/* --- 4.2 Parallel Region Handles------------------------------------------- */ + +/** + * Retrieve the handle for the innermost patallel region for an OpenMP thread. + * + * The operation ompd_get_top_parallel_region enables the debugger to obtain + * the handle for the innermost parallel region associated with an OpenMP + * thread. This call is meaningful only if the thread whose handle is provided + * is stopped. + */ +typedef ompd_rc_t (*ompd_get_top_parallel_region_fn_t) ( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_parallel_handle_t **parallel_handle /* OUT: OpenMP parallel handle */ + ); +/** + * Retrieve the handle for an enclosing parallel region. + * + * The ompd_get_enclosing_parallel_handle operation enables the debugger to + * obtain the handle for the parallel region enclosing the parallel region + * specified by parallel_handle. This call is meaningful only if at least one + * thread in the parallel region is stopped. + */ +typedef ompd_rc_t (*ompd_get_enclosing_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ); +/** + * Retrieve the handle for the enclosing parallel region or a task region. + * + * The ompd_get_task_enclosing_parallel_handle operation enables the debugger to + * obtain the handle for the parallel region enclosing the task region + * specified by task_handle. This call is meaningful only if at least one + * thread in the parallel region is stopped. + */ +typedef ompd_rc_t (*ompd_get_task_enclosing_parallel_handle_fn_t) ( + ompd_task_handle_t* task_handle, /* IN: OpenMP task handle */ + ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ); +typedef ompd_rc_t (*ompd_release_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle +); +typedef ompd_rc_t (*ompd_parallel_handle_compare_fn_t) ( + ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value +); +typedef ompd_rc_t (*ompd_get_parallel_handle_string_id_fn_t) ( + ompd_parallel_handle_t *parallel_handle, + char **string_id +); +/* --- 4.3 Task Handles ----------------------------------------------------- */ + +/** + * Retrieve the handle for the innermost task for an OpenMP thread. + * + * The debugger uses the operation ompd_get_top_task_region to obtain the handle + * for the innermost task region associated with an OpenMP thread. This call is + * meaningful only if the thread whose handle is provided is stopped. 
+ */ +typedef ompd_rc_t (*ompd_get_top_task_region_fn_t) ( + ompd_thread_handle_t* thread_handle, /* IN: OpenMP thread handle*/ + ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ + ); +/** + * Retrieve the handle for an enclosing task. + * + * The debugger uses ompd_get_ancestor_task_region to obtain the handle for the + * task region enclosing the task region specified by task_handle. This call is + * meaningful only if the thread executing the task specified by task_handle is + * stopped. + */ +typedef ompd_rc_t (*ompd_get_ancestor_task_region_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ); +typedef ompd_rc_t (*ompd_get_generating_ancestor_task_region_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ); +typedef ompd_rc_t (*ompd_get_scheduling_ancestor_task_region_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ); +/** + * Retrieve implicit task handle for a parallel region. + * + * The ompd_get_implicit_task_in_parallel operation enables the debugger to + * obtain handles for implicit tasks associated with a parallel region. This + * call is meaningful only if all threads associated with the parallel region + * are stopped. + */ +typedef ompd_rc_t (*ompd_get_implicit_task_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_task_handle_t ***task_handle_array, /* OUT: array of OpenMP task handles */ + int *num_handles /* OUT: number of task handles */ + ); +typedef ompd_rc_t (*ompd_release_task_handle_fn_t) ( + ompd_task_handle_t *task_handle +); +typedef ompd_rc_t (*ompd_task_handle_compare_fn_t) ( + ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value +); +typedef ompd_rc_t (*ompd_get_task_handle_string_id_fn_t) ( + ompd_task_handle_t *task_handle, + char **string_id +); +/* --- 5o Process and Thread Settings ---------------------------------------- */ + +/** + * The functions ompd_get_num_procs and ompd_get_thread_limit are third-party + * versions of the OpenMP runtime functions omp_get_num_procs and + * omp_get_thread_limit. + */ +typedef ompd_rc_t (*ompd_get_num_procs_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: number of processes */ + ); +typedef ompd_rc_t (*ompd_get_thread_limit_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: max number of threads */ + ); + /* --- 6 Parallel Region Inqueries ------------------------------------------ */ + /* --- 6.1 Settings --------------------------------------------------------- */ + /** + * Determine the number of threads associated with a parallel region. + */ + typedef ompd_rc_t (*ompd_get_num_threads_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: number of threads */ + ); + /** + * Determine the nesting depth of a particular parallel region instance. + */ + typedef ompd_rc_t (*ompd_get_level_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: nesting level */ + ); + /** + * Determine the number of enclosing active parallel regions. 
+ * + * ompd_get_active_level returns the number of nested, active parallel regions + * enclosing the parallel region specified by its handle. + */ + typedef ompd_rc_t (*ompd_get_active_level_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: active nesting level */ + ); + /* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ + + /* --- 7 Thread Inquiry ----------------------------------------------------- */ + /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ + /** + * Obtain an OpenMP thread handle and the internal OS thread handle for the + * selected (context) thread. + * If the function returns ompd_rc_ok then the operating system thread + * corresponds to an OpenMP thread and the thread_handle is initialized. The + * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime + * system. + */ + typedef ompd_rc_t (*ompd_get_thread_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_kind_t kind, + ompd_size_t sizeof_osthread, + const void* osthread, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ + ); + /** + * Obtain the OS thread handle for an OpenMP thread handle. + * this might change over time in case virtual openmp threads migrate between + * OS threads. + */ + typedef ompd_rc_t (*ompd_get_osthread_fn_t) ( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_thread_id_kind_t kind, + ompd_size_t sizeof_osthread, + void *osthread + ); + typedef ompd_rc_t (*ompd_get_thread_num_fn_t) ( + ompd_thread_handle_t* thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *val /* OUT: number of the thread within the team */ + ); + /* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ + + /** + * Get the state of a thread. This can use OMPT state data structure to define + * different states of threads (e.g., idle, working, or barrier, etc) and what + * entity cased this state (e.g., address of a lock); + * + * The function ompd_get_state is a third-party version of ompt_get_state. The + * only difference between the OMPD and OMPT counterparts is that the OMPD + * version must supply a thread handle to provide a context for this inquiry. + */ + typedef ompd_rc_t (*ompd_get_state_fn_t) ( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + /* --- 8 Task Inquiry ------------------------------------------------------- */ + + /* --- 8.1 Task Function Entry Point ---------------------------------------- */ + + /** + * The ompd_get_task_function returns the entry point of the code that + * corresponds to the body of code executed by the task. + */ + typedef ompd_rc_t (*ompd_get_task_function_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_address_t *entry_point /* OUT: first instruction in the task region */ + ); + /* --- 8.2 Task Settings ---------------------------------------------------- */ + + /** + * Retrieve information from OpenMP tasks. These inquiry functions have no + * counterparts in the OMPT interface as a first-party tool can call OpenMP + * runtime inquiry functions directly. 
The only difference between the OMPD + * inquiry operations and their counterparts in the OpenMP runtime is that the + * OMPD version must supply a task handle to provide a context for each inquiry. + */ + typedef ompd_rc_t (*ompd_get_max_threads_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ); + typedef ompd_rc_t (*ompd_in_parallel_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: Is OpenMP in parallel? */ + ); + typedef ompd_rc_t (*ompd_in_final_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: Is OpenMP in final? */ + ); + typedef ompd_rc_t (*ompd_get_dynamic_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: ? */ + ); + typedef ompd_rc_t (*ompd_get_nested_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_word_t *val /* OUT: Is this task nested? */ + ); + typedef ompd_rc_t (*ompd_get_max_active_levels_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_word_t *val /* OUT: max active levels */ + ); +#if 0 + typedef ompd_rc_t (*ompd_get_schedule_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_sched_t *kind, /* OUT: Kind of OpenMP schedule*/ + ompd_word_t *modifier /* OUT: Schedunling modifier */ + ); +#endif + typedef ompd_rc_t (*ompd_get_proc_bind_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_proc_bind_t *bind /* OUT: Kind of proc-binding */ + ); + typedef ompd_rc_t (*ompd_is_implicit_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: implicit=1, explicit=0 */ + ); +/* --- 8.3 OMPT Task Inquiry Analogues -------------------------------------- */ + +/** + * The functions defined here are third-party versions of ompt_get_task_frame + * and ompt_get_task_id. The only difference between the OMPD and OMPT + * counterparts is that the OMPD version must supply a task handle to provide a + * context for these inquiries. + */ +/** + * sp_exit + * + * This value is set once, the first time that a task exits the runtime to begin + * executing user code. This field points to the stack frame of the runtime + * procedure that called the user code. This value is NULL until just before the + * task exits the runtime. + * + * sp_reentry + * + * This value is set each time that current task re-enters the runtime to create + * new (implicit or explicit) tasks. This field points to the stack frame of the + * runtime procedure called by a task to re-enter the runtime. This value is NULL + * until just after the task re-enters the runtime. + */ +typedef ompd_rc_t (*ompd_get_task_frame_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_address_t *sp_exit, /* OUT: next frame is user code */ + ompd_address_t *sp_reentry /* OUT: previous frame is user code */ + ); +#if 0 +typedef ompd_rc_t (*ompd_get_task_id_fn_t) ( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_id_t *task_id /* OUT: OpenMP task ID */ + ); +#endif +/* --- 13 Display Control Variables ----------------------------------------- */ + +/** + * Using the ompd_display_control_vars function, the debugger can extract a + * string that contains a sequence of name/value pairs of control variables + * whose settings are (a) user controllable, and (b) important to the operation + * or performance of an OpenMP runtime system. 
The control variables exposed + * through this interface will include all of the OMP environment variables, + * settings that may come from vendor or platform- specific environment + * variables (e.g., the IBM XL compiler has an environment variable that + * controls spinning vs. blocking behavior), and other settings that affect + * the operation or functioning of an OpenMP runtime system (e.g., numactl + * settings that cause threads to be bound to cores). + */ +typedef ompd_rc_t (*ompd_get_display_control_vars_fn_t) ( + ompd_address_space_handle_t *handle, /* IN */ + const char * const **control_var_values /* OUT */ +); +typedef ompd_rc_t (*ompd_release_display_control_vars_fn_t) ( + const char * const **control_var_values /* IN */ +); diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index 5afb2ab47..e53ac2fc1 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -8,8 +8,27 @@ const ompd_callbacks_t *TValue::callbacks = NULL; ompd_target_type_sizes_t TValue::type_sizes; +// MARKER_MR: This is just compat stuff because I dont have time to +// replace this function. TODO: replace this function inline int ompd_sizeof(ompd_target_prim_types_t t) { - return (((int *)&TValue::type_sizes)[(int)t]); + assert(t != ompd_type_max && "ompd_type_max should not be used anywhere"); + assert(t != ompd_type_invalid && "request size of invalid type"); + + switch (t) { + case ompd_type_char: + return TValue::type_sizes.sizeof_char; + case ompd_type_short: + return TValue::type_sizes.sizeof_short; + case ompd_type_int: + return TValue::type_sizes.sizeof_int; + case ompd_type_long: + return TValue::type_sizes.sizeof_long; + case ompd_type_long_long: + return TValue::type_sizes.sizeof_long_long; + case ompd_type_pointer: + return TValue::type_sizes.sizeof_pointer; + } + return 0; } TType &TTypeFactory::getType(ompd_address_space_context_t *context, diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 18356e071..5c4912478 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -32,7 +32,7 @@ uint64_t ompd_state; /* --- 3 Initialization ----------------------------------------------------- */ -ompd_rc_t ompd_initialize(const ompd_callbacks_t *table, ompd_word_t version) { +ompd_rc_t ompd_initialize(ompd_word_t version, const ompd_callbacks_t *table) { ompd_rc_t ret = table ? ompd_rc_ok : ompd_rc_bad_input; callbacks = table; TValue::callbacks = table; @@ -57,6 +57,7 @@ ompd_process_initialize(ompd_address_space_context_t ompd_rc_t ret = initTypeSizes(context); if (ret != ompd_rc_ok) return ret; +#if 0 ret = TValue(context, "ompd_rtl_version") .castBase(ompd_type_int) .getValue(rtl_version); @@ -70,6 +71,7 @@ ompd_process_initialize(ompd_address_space_context_t .getValue(ompd_state); if (ret != ompd_rc_ok) return ret; +#endif *addrhandle = new ompd_address_space_handle_t; if (!addrhandle) return ompd_rc_error; @@ -121,6 +123,56 @@ ompd_rc_t ompd_release_address_space_handle( return ompd_rc_ok; } +ompd_rc_t ompd_device_initialize( + ompd_address_space_handle_t *process_handle, + ompd_address_space_context_t *device_context, + int kind, + ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle + ) +{ + if (!device_context) + return ompd_rc_bad_input; + + // TODO:(mr) primitive type sizes can be different on devices? Think about implementing that + + ompd_rc_t ret; + uint64_t ompd_num_cuda_devices; + + ret = TValue(process_handle->context, "ompd_num_cuda_devices"). + castBase(ompd_type_long_long). 
+ getValue(ompd_num_cuda_devices); + if (ret != ompd_rc_ok) + return ret; + + + for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { + uint64_t cuda_ctx; + + ret = TValue(process_handle->context, "ompd_CudaContextArray"). + cast("ompd_cuda_context_ptr_t",1). + getArrayElement(i). + castBase(ompd_type_long_long). + getValue(cuda_ctx); + + if ( ret != ompd_rc_ok ) + continue; + + if (cuda_ctx == (*((uint64_t *)id))) { + *device_handle = new ompd_address_space_handle_t; + if (!device_handle) + return ompd_rc_error; + (*device_handle)->context = device_context; + (*device_handle)->kind = ompd_device_kind_cuda; + (*device_handle)->id = (uint64_t)id; + return ompd_rc_ok; + } + } + + return ompd_rc_unavailable; +} + #if 0 // no device support yet ompd_rc_t ompd_device_initialize ( ompd_address_space_context_t *context, /* IN: */ @@ -463,7 +515,7 @@ ompd_rc_t ompd_get_parallel_handle_string_id ( /* task_handle is of type (kmp_taskdata_t) */ -ompd_rc_t ompd_get_current_task__handle( +ompd_rc_t ompd_get_current_task_handle( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ ) { @@ -511,7 +563,7 @@ ompd_rc_t ompd_get_current_task__handle( return ompd_rc_ok; } -ompd_rc_t ompd_get_generating_ancestor_task_handle( +ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { @@ -571,7 +623,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ret; } -ompd_rc_t ompd_get_scheduling_ancestor_task_handle( +ompd_rc_t ompd_get_scheduling_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { @@ -831,8 +883,10 @@ ompd_rc_t ompd_get_parallel_data( ompd_address_space_context_t *context = parallel_handle->ah->context; if (!context) return ompd_rc_stale_handle; +#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; +#endif assert(callbacks && "Callback table not initialized!"); @@ -1059,8 +1113,10 @@ ompd_rc_t ompd_get_state( ompd_address_space_context_t *context = thread_handle->ah->context; if (!context) return ompd_rc_stale_handle; +#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; +#endif assert(callbacks && "Callback table not initialized!"); @@ -1341,8 +1397,10 @@ ompd_rc_t ompd_get_task_frame( ompd_address_space_context_t *context = task_handle->ah->context; if (!context) return ompd_rc_stale_handle; +#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; +#endif assert(callbacks && "Callback table not initialized!"); @@ -1387,8 +1445,10 @@ ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ ompd_address_space_context_t *context = task_handle->ah->context; if (!context) return ompd_rc_stale_handle; +#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; +#endif assert(callbacks && "Callback table not initialized!"); diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 25514bcc6..0fdd1c2d9 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -85,7 +85,7 @@ typedef struct ompd_address_t { #define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) #define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) -#if 0 // types removed in Austin F2F +//#if 0 // types removed in Austin F2F /* * Definition of OMPD states, taken from OMPT */ @@ -132,6 +132,7 @@ typedef enum ompd_state_t { #undef ompd_state_macro } ompd_state_t; +#if 0 typedef enum ompd_sched_t { 
ompd_sched_static = 1, ompd_sched_dynamic = 2, @@ -425,8 +426,9 @@ ompd_get_api_version_string(const char **string /* OUT: OMPD version string */ * maintain the functions valid for as long as needed. */ ompd_rc_t -ompd_initialize(const ompd_callbacks_t *table, /* IN: callbacks table */ - ompd_word_t version); +ompd_initialize(ompd_word_t version, + const ompd_callbacks_t *table /* IN: callbacks table */ + ); ompd_rc_t ompd_process_initialize(ompd_address_space_context_t @@ -451,12 +453,12 @@ ompd_rc_t ompd_release_address_space_handle( ); ompd_rc_t ompd_device_initialize( - ompd_address_space_context_t - *context, /* IN: debugger handle for the device */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t * - *addrhandle /* OUT: ompd handle for the device */ + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_address_space_context_t *device_context, + int kind, + ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle ); ompd_rc_t ompd_finalize(void); @@ -475,13 +477,8 @@ ompd_rc_t ompd_finalize(void); * and/or destroying threads during or after the call, rendering useless the * vector of handles returned. */ -#if 0 -ompd_rc_t ompd_get_threads ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ - int *num_handles /* OUT: number of handles in the array */ - ); -#endif + + /** * Retrieve handles for OpenMP threads in a parallel region. * @@ -589,7 +586,7 @@ ompd_rc_t ompd_get_parallel_handle_string_id ( * for the innermost task region associated with an OpenMP thread. This call is * meaningful only if the thread whose handle is provided is stopped. */ -ompd_rc_t ompd_get_current_task__handle( +ompd_rc_t ompd_get_current_task_handle( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ ); @@ -609,12 +606,12 @@ ompd_rc_t ompd_get_ancestor_task_handle( ); #endif -ompd_rc_t ompd_get_generating_ancestor_task_handle( +ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ); -ompd_rc_t ompd_get_scheduling_ancestor_task_handle( +ompd_rc_t ompd_get_scheduling_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ); @@ -762,7 +759,7 @@ ompd_rc_t ompd_get_thread_num( * different states of threads (e.g., idle, working, or barrier, etc) and what * entity cased this state (e.g., address of a lock); * - * The function ompd_get_state is a third-party version of ompt_get_state. The + * The function ompd_get_state is a third-party version of ompt_get_state. The * only difference between the OMPD and OMPT counterparts is that the OMPD * version must supply a thread handle to provide a context for this inquiry. 
*/ diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt index 3d9c78a72..8b721c0e7 100644 --- a/libomptarget/CMakeLists.txt +++ b/libomptarget/CMakeLists.txt @@ -53,6 +53,12 @@ if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) add_definitions(-O0) endif() +# OMPD support for libomptarget (currently only with cuda) +set(LIBOMPTARGET_OMPD_SUPPORT FALSE CACHE BOOL "OMPD-support?") +if (LIBOMPTARGET_OMPD_SUPPORT) + add_definitions(-DOMPD_SUPPORT=1) +endif() + include_directories(include) # Build target agnostic offloading library. diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp index fe2f9f67c..90b2cd6e9 100644 --- a/libomptarget/plugins/cuda/src/rtl.cpp +++ b/libomptarget/plugins/cuda/src/rtl.cpp @@ -54,6 +54,19 @@ static int DebugLevel = 0; {} #endif +#if OMPD_SUPPORT +#ifdef __cplusplus +extern "C" { +#endif + /* TODO - Put these OMPD globals someplace cleaner */ + uint64_t ompd_num_cuda_devices; + CUcontext* ompd_CudaContextArray; +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* OMPD_SUPPORT */ + + /// Keep entries table per device. struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -204,6 +217,10 @@ class RTLDeviceInfoTy { FuncGblEntries.resize(NumberOfDevices); Contexts.resize(NumberOfDevices); +#if OMPD_SUPPORT + ompd_num_cuda_devices = (uint64_t)Contexts.size(); + ompd_CudaContextArray = &Contexts[0]; +#endif /* OMPD_SUPPORT */ ThreadsPerBlock.resize(NumberOfDevices); BlocksPerGrid.resize(NumberOfDevices); WarpSize.resize(NumberOfDevices); diff --git a/runtime/src/ompd-specific.cpp b/runtime/src/ompd-specific.cpp index 89892e00c..ff415cfa3 100644 --- a/runtime/src/ompd-specific.cpp +++ b/runtime/src/ompd-specific.cpp @@ -23,45 +23,45 @@ OMPD_FOREACH_SIZEOF(ompd_declare_sizeof) #undef ompd_declare_sizeof const char * * ompd_dll_locations=NULL; -const char * ompd_my_dll_locations[2] = {"libompd_intel.so",NULL}; -uint64_t ompd_state=0; + const char * ompd_my_dll_locations[2] = {"libompd_intel.so",NULL}; + uint64_t ompd_state=0; -int ompd_rtl_version = 7; + int ompd_rtl_version = 7; -void ompd_init() -{ - -static int ompd_initialized = 0; + void ompd_init() + { + + static int ompd_initialized = 0; -if (ompd_initialized) - return; - -/** - * Calculate member offsets for structs and unions - */ + if (ompd_initialized) + return; + + /** + * Calculate member offsets for structs and unions + */ #define ompd_init_access(t,m) ompd_access__##t##__##m = (uint64_t)&(((t*)0)->m); -OMPD_FOREACH_ACCESS(ompd_init_access) + OMPD_FOREACH_ACCESS(ompd_init_access) #undef ompd_init_access -/** - * Create bit mask for bitfield access - */ + /** + * Create bit mask for bitfield access + */ #define ompd_init_bitfield(t,m) ompd_bitfield__##t##__##m=0; ((t*)(&ompd_bitfield__##t##__##m))->m = 1; -OMPD_FOREACH_BITFIELD(ompd_init_bitfield) + OMPD_FOREACH_BITFIELD(ompd_init_bitfield) #undef ompd_init_bitfield -/** - * Calculate type size information - */ + /** + * Calculate type size information + */ #define ompd_init_sizeof_member(t,m) ompd_sizeof__##t##__##m = sizeof(((t*)0)->m); -OMPD_FOREACH_ACCESS(ompd_init_sizeof_member) + OMPD_FOREACH_ACCESS(ompd_init_sizeof_member) #undef ompd_init_sizeof_member #define ompd_init_sizeof(t) ompd_sizeof__##t = sizeof(t); -OMPD_FOREACH_SIZEOF(ompd_init_sizeof) + OMPD_FOREACH_SIZEOF(ompd_init_sizeof) #undef ompd_init_sizeof ompd_dll_locations=ompd_my_dll_locations; diff --git a/runtime/src/ompd-specific.h b/runtime/src/ompd-specific.h index 8abf0848d..75995a495 100644 --- a/runtime/src/ompd-specific.h +++ 
b/runtime/src/ompd-specific.h @@ -107,6 +107,9 @@ OMPD_BITFIELD(kmp_tasking_flags_t, freed) \ OMPD_BITFIELD(kmp_tasking_flags_t, native) \ +// TODO: (mr) this is a hack to cast cuda contexts to 64 bit values +typedef uint64_t ompd_cuda_context_ptr_t; + #define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ OMPD_SIZEOF(kmp_info_t) \ OMPD_SIZEOF(kmp_taskdata_t) \ @@ -118,6 +121,7 @@ OMPD_SIZEOF(__kmp_avail_proc) \ OMPD_SIZEOF(__kmp_max_nth) \ OMPD_SIZEOF(__kmp_gtid) \ OMPD_SIZEOF(__kmp_nth) \ +OMPD_SIZEOF(ompd_cuda_context_ptr_t) \ #endif /* OMPD_SUPPORT */ #endif From 0c3427ebcc7fbdad82914203cbd0d115168c1b3c Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 29 Jun 2018 10:38:41 -0700 Subject: [PATCH 04/64] Add ompd_get_thread_handle for cuda. * Code adapted from Marty Mcfadden's () code for the clang-ykt runtime. * OMPD code in the nvptx device runtime library in libomptarget is currently included regardless of compile flags/defines --- libompd/gdb-wrapper/OMPDCommand.cpp | 2 +- libompd/src/omp-debug.cpp | 1192 ++++++++--------- libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 1 + .../deviceRTLs/nvptx/src/ompd-specific.cu | 62 + .../deviceRTLs/nvptx/src/ompd-specific.h | 30 + .../deviceRTLs/nvptx/src/omptarget-nvptx.cu | 6 + .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 10 + 7 files changed, 679 insertions(+), 624 deletions(-) create mode 100644 libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu create mode 100644 libomptarget/deviceRTLs/nvptx/src/ompd-specific.h diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 647ecf109..bfbfc4aa0 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -217,7 +217,7 @@ void OMPDThreads::execute() const for(auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { - cout << "Cuda device with context " << i.coord.cudaContext << "not initialized as OpenMP device. Trying to initialize\n"; + cout << "Cuda device with context " << i.coord.cudaContext << " not initialized as OpenMP device. Trying to initialize\n"; OMPDCudaContextPool* cpool; cpool = new OMPDCudaContextPool(&i); ompd_rc_t result; diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 5c4912478..a163e143c 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -150,6 +150,7 @@ ompd_rc_t ompd_device_initialize( for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { uint64_t cuda_ctx; + // TODO: (mr) think of a better way to cast contexts ret = TValue(process_handle->context, "ompd_CudaContextArray"). cast("ompd_cuda_context_ptr_t",1). getArrayElement(i). @@ -173,64 +174,6 @@ ompd_rc_t ompd_device_initialize( return ompd_rc_unavailable; } -#if 0 // no device support yet -ompd_rc_t ompd_device_initialize ( - ompd_address_space_context_t *context, /* IN: */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the device */ - ) -{ - if (!context) - return ompd_rc_bad_input; - - ompd_rc_t ret = initTypeSizes(context); - if (ret != ompd_rc_ok) - return ret; - - uint64_t ompd_num_cuda_devices; - - ompd_address_space_context_t *process_context; - ret = callbacks->get_containing_process_context(context, &process_context); - if ( ret != ompd_rc_ok ) - return ret; - - ret = TValue(process_context, "ompd_num_cuda_devices"). - castBase(ompd_type_long_long). 
- getValue(ompd_num_cuda_devices); - if (ret != ompd_rc_ok) { - return ret; - } - - for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { - uint64_t cuda_ctx; - - /* TODO(mjm) - Hack! Currently using ompt_parallel_id_t. Need to find a - * place to define ID type information for CUDA contexts - */ - ret = TValue(process_context, "ompd_CudaContextArray"). - cast("ompt_parallel_id_t",1). - getArrayElement(i). - castBase(ompd_type_long_long). - getValue(cuda_ctx); - - if ( ret != ompd_rc_ok ) - continue; - - if (cuda_ctx == id) { - *addrhandle = new ompd_address_space_handle_t; - if (!addrhandle) - return ompd_rc_error; - (*addrhandle)->context = context; - - return ompd_rc_ok; - } - } - - /* TODO(mjm) - Find appropriate error return result for not finding a match */ - return ompd_rc_ok; -} -#endif // no device support /* --- 4 Handle Management -------------------------------------------------- */ @@ -294,116 +237,57 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, int *cmp_value) { if (!thread_handle_1) return ompd_rc_stale_handle; - if (!thread_handle_2) - return ompd_rc_stale_handle; - *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; - return ompd_rc_ok; -} + if (!thread_handle_2) + return ompd_rc_stale_handle; + *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; + return ompd_rc_ok; + } #if 0 -ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id - ) -{ - pthread_t thread_id; - ompd_rc_t ret; - ret = ompd_get_thread_id(thread_handle, ompd_thread_id_pthread, sizeof(pthread_t), &thread_id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)thread_id); - return ompd_rc_ok; -} + ompd_rc_t ompd_get_thread_handle_string_id ( + ompd_thread_handle_t *thread_handle, + char **string_id + ) + { + pthread_t thread_id; + ompd_rc_t ret; + ret = ompd_get_thread_id(thread_handle, ompd_thread_id_pthread, sizeof(pthread_t), &thread_id); + if (ret!=ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); + if (ret!=ompd_rc_ok) + return ret; + sprintf(*string_id, "0x%llx", (long long)thread_id); + return ompd_rc_ok; + } #endif -/* --- 4.2 Parallel Region Handles------------------------------------------- */ + /* --- 4.2 Parallel Region Handles------------------------------------------- */ -/* parallel_handle is of type (kmp_base_team_t)*/ + /* parallel_handle is of type (kmp_base_team_t)*/ -ompd_rc_t ompd_get_current_parallel_handle( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_parallel_handle_t **parallel_handle /* OUT: OpenMP parallel handle */ - ) { - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; - - TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_team") /*__kmp_threads[t]->th.th_team*/ - .cast("kmp_team_p", 1) - .access("t"); /*__kmp_threads[t]->th.th_team->t*/ - - ompd_rc_t ret = teamdata.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - 
.access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; - - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; - - (*parallel_handle)->ah = thread_handle->ah; - (*parallel_handle)->th = taddr; - (*parallel_handle)->lwt = lwt; - return ompd_rc_ok; -} - -ompd_rc_t ompd_get_enclosing_parallel_handle( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_parallel_handle_t * - *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_current_parallel_handle( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_parallel_handle_t **parallel_handle /* OUT: OpenMP parallel handle */ + ) { + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = parallel_handle->th, lwt; - - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, parallel_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr, lwt; - TValue teamdata = - TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_parent") /*t.t_parent*/ - .cast("kmp_team_p", 1) - .access("t"); /*t.t_parent->t*/ + TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_team") /*__kmp_threads[t]->th.th_team*/ + .cast("kmp_team_p", 1) + .access("t"); /*__kmp_threads[t]->th.th_team->t*/ - ret = teamdata.getAddress(&taddr); + ompd_rc_t ret = teamdata.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; @@ -414,188 +298,188 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( .getValue(lwt.address); if (ret != ompd_rc_ok) return ret; - } - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(enclosing_parallel_handle)); - if (ret != ompd_rc_ok) - return ret; - (*enclosing_parallel_handle)->th = taddr; - (*enclosing_parallel_handle)->lwt = lwt; - (*enclosing_parallel_handle)->ah = parallel_handle->ah; - return ompd_rc_ok; -} + ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; -ompd_rc_t ompd_get_task_parallel_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_parallel_handle_t * - *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->lwt = lwt; + return ompd_rc_ok; + } - if (!context) - return 
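/* Editor's sketch (not part of this patch): ompd_get_enclosing_parallel_handle
 * lets a tool step outward from a thread's current region to its parent region,
 * including serialized teams tracked through the lwt field. The helper name below
 * is illustrative; the caller keeps only *parent and must release it later. */
#include "ompd.h"

ompd_rc_t get_parent_region(ompd_thread_handle_t *thread,
                            ompd_parallel_handle_t **parent) {
  ompd_parallel_handle_t *cur = NULL;
  ompd_rc_t rc = ompd_get_current_parallel_handle(thread, &cur);
  if (rc != ompd_rc_ok)
    return rc;
  rc = ompd_get_enclosing_parallel_handle(cur, parent);
  ompd_release_parallel_handle(cur);   /* handles are separate allocations */
  return rc;
}
/* Because each call allocates a fresh handle, ompd_parallel_handle_compare is
 * the way to test whether two handles actually name the same team. */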
ompd_rc_stale_handle; + ompd_rc_t ompd_get_enclosing_parallel_handle( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_parallel_handle_t * + *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + if (!context) + return ompd_rc_stale_handle; - ompd_rc_t ret; - ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .getAddress(&taddr); + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr = parallel_handle->th, lwt; + + ompd_rc_t ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, parallel_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 - if (ret != ompd_rc_ok) - return ret; + TValue teamdata = + TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_parent") /*t.t_parent*/ + .cast("kmp_team_p", 1) + .access("t"); /*t.t_parent->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + } - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(enclosing_parallel_handle)); - if (ret != ompd_rc_ok) - return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(enclosing_parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + (*enclosing_parallel_handle)->th = taddr; + (*enclosing_parallel_handle)->lwt = lwt; + (*enclosing_parallel_handle)->ah = parallel_handle->ah; + return ompd_rc_ok; + } - (*enclosing_parallel_handle)->ah = task_handle->ah; - (*enclosing_parallel_handle)->lwt = task_handle->lwt; - (*enclosing_parallel_handle)->th = taddr; - return ompd_rc_ok; -} + ompd_rc_t ompd_get_task_parallel_handle( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_parallel_handle_t * + *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; -ompd_rc_t ompd_release_parallel_handle( - ompd_parallel_handle_t *parallel_handle /* IN: OpenMP parallel handle */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; - return ompd_rc_ok; -} + if (!context) + return ompd_rc_stale_handle; -ompd_rc_t -ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, - ompd_parallel_handle_t *parallel_handle_2, - int *cmp_value) { - if (!parallel_handle_1) - return ompd_rc_stale_handle; - if (!parallel_handle_2) - return ompd_rc_stale_handle; - if (parallel_handle_1->th.address - parallel_handle_2->th.address) - *cmp_value = parallel_handle_1->th.address - 
parallel_handle_2->th.address; - else - *cmp_value = - parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; - return ompd_rc_ok; -} + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr; -#if 0 // parallel-id is initialized to zero -ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id - ) -{ - ompd_parallel_id_t id; ompd_rc_t ret; - ret = ompd_get_parallel_id(parallel_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; -} -#endif - -/* --- 4.3 Task Handles ----------------------------------------------------- */ + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .getAddress(&taddr); -/* task_handle is of type (kmp_taskdata_t) */ + if (ret != ompd_rc_ok) + return ret; -ompd_rc_t ompd_get_current_task_handle( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ - ) { - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(enclosing_parallel_handle)); + if (ret != ompd_rc_ok) + return ret; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; + (*enclosing_parallel_handle)->ah = task_handle->ah; + (*enclosing_parallel_handle)->lwt = task_handle->lwt; + (*enclosing_parallel_handle)->th = taddr; + return ompd_rc_ok; + } - TValue taskdata = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ - .cast("kmp_taskdata_t", 1); + ompd_rc_t ompd_release_parallel_handle( + ompd_parallel_handle_t *parallel_handle /* IN: OpenMP parallel handle */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + return ompd_rc_ok; + } - ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + ompd_rc_t + ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value) { + if (!parallel_handle_1) + return ompd_rc_stale_handle; + if (!parallel_handle_2) + return ompd_rc_stale_handle; + if (parallel_handle_1->th.address - parallel_handle_2->th.address) + *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; + else + *cmp_value = + parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + return ompd_rc_ok; + } - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = taskdata - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; +#if 0 // parallel-id is initialized to zero + ompd_rc_t ompd_get_parallel_handle_string_id ( + ompd_parallel_handle_t *parallel_handle, + char **string_id + ) + { + ompd_parallel_id_t id; + ompd_rc_t ret; + ret = 
ompd_get_parallel_id(parallel_handle, &id); + if (ret!=ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); + if (ret!=ompd_rc_ok) + return ret; + sprintf(*string_id, "0x%llx", (long long)id); + return ompd_rc_ok; + } +#endif - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(task_handle)); - if (ret != ompd_rc_ok) - return ret; + /* --- 4.3 Task Handles ----------------------------------------------------- */ - (*task_handle)->th = taddr; - (*task_handle)->lwt = lwt; - (*task_handle)->ah = thread_handle->ah; - return ompd_rc_ok; -} + /* task_handle is of type (kmp_taskdata_t) */ -ompd_rc_t ompd_get_generating_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_current_task_handle( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ + ) { + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = task_handle->th, lwt; - - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, task_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr, lwt; - TValue taskdata = TValue(context, task_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_taskdata_t") /*td*/ - .access("td_parent") /*td->td_parent*/ - .cast("kmp_taskdata_t", 1); + TValue taskdata = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ + .cast("kmp_taskdata_t", 1); - ret = taskdata.dereference().getAddress(&taddr); + ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); if (ret != ompd_rc_ok) return ret; @@ -610,378 +494,437 @@ ompd_rc_t ompd_get_generating_task_handle( .getValue(lwt.address); if (ret != ompd_rc_ok) return ret; - } - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(parent_task_handle)); - if (ret != ompd_rc_ok) - return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(task_handle)); + if (ret != ompd_rc_ok) + return ret; - (*parent_task_handle)->th = taddr; - (*parent_task_handle)->lwt = lwt; - (*parent_task_handle)->ah = task_handle->ah; - return ret; -} + (*task_handle)->th = taddr; + (*task_handle)->lwt = lwt; + (*task_handle)->ah = thread_handle->ah; + return ompd_rc_ok; + } -ompd_rc_t ompd_get_scheduling_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - 
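/* Editor's sketch (not part of this patch): with the *_ancestor_* names dropped,
 * a tool walks a task's generating chain with ompd_get_current_task_handle
 * followed by repeated ompd_get_generating_task_handle calls, releasing each
 * handle as it goes. How the runtime signals the end of the chain is treated as
 * an assumption here, so the loop also carries an explicit depth bound. */
#include "ompd.h"

ompd_rc_t walk_generating_tasks(ompd_thread_handle_t *thread, int max_depth) {
  ompd_task_handle_t *task = NULL;
  ompd_rc_t rc = ompd_get_current_task_handle(thread, &task);
  if (rc != ompd_rc_ok)
    return rc;
  for (int depth = 0; depth < max_depth && task; depth++) {
    ompd_task_handle_t *parent = NULL;
    rc = ompd_get_generating_task_handle(task, &parent);
    ompd_release_task_handle(task);
    task = (rc == ompd_rc_ok) ? parent : NULL;   /* stop on any error */
  }
  if (task)
    ompd_release_task_handle(task);
  return ompd_rc_ok;
}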
ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_generating_task_handle( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr = task_handle->th, lwt; + + ompd_rc_t ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, task_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + + TValue taskdata = TValue(context, task_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_taskdata_t") /*td*/ + .access("td_parent") /*td->td_parent*/ + .cast("kmp_taskdata_t", 1); + + ret = taskdata.dereference().getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = taskdata + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + } - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("scheduling_parent") // td->ompd_task_info.scheduling_parent - .cast("kmp_taskdata_t", 1) - .dereference() - .getAddress(&taddr); + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(parent_task_handle)); + if (ret != ompd_rc_ok) + return ret; - if (ret != ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(parent_task_handle)); - if (ret != ompd_rc_ok) + (*parent_task_handle)->th = taddr; + (*parent_task_handle)->lwt = lwt; + (*parent_task_handle)->ah = task_handle->ah; return ret; + } - (*parent_task_handle)->th = taddr; - (*parent_task_handle)->ah = task_handle->ah; - return ret; -} - -ompd_rc_t ompd_get_task_in_parallel( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - int nth_handle, /* OUT: number of the task handle */ - ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ - ) { - int i; - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_scheduling_task_handle( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr; - ompd_rc_t 
ret; - ompd_address_t taddr; - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ + ompd_rc_t ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("ompt_task_info") // td->ompt_task_info + .cast("ompt_task_info_t") + .access("scheduling_parent") // td->ompd_task_info.scheduling_parent .cast("kmp_taskdata_t", 1) - .getArrayElement( - nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ + .dereference() .getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(task_handle)); - if (ret != ompd_rc_ok) + if (ret != ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(parent_task_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*parent_task_handle)->th = taddr; + (*parent_task_handle)->ah = task_handle->ah; return ret; + } - (*task_handle)->th = taddr; - (*task_handle)->ah = parallel_handle->ah; - return ret; -} + ompd_rc_t ompd_get_task_in_parallel( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + int nth_handle, /* OUT: number of the task handle */ + ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ + ) { + int i; + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; -ompd_rc_t ompd_release_task_handle( - ompd_task_handle_t *task_handle /* IN: OpenMP task handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(task_handle)); - if (ret != ompd_rc_ok) - return ret; - return ompd_rc_ok; -} + assert(callbacks && "Callback table not initialized!"); -ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, - ompd_task_handle_t *task_handle_2, - int *cmp_value) { - if (!task_handle_1) - return ompd_rc_stale_handle; - if (!task_handle_2) - return ompd_rc_stale_handle; - if (task_handle_1->th.address - task_handle_2->th.address) - *cmp_value = task_handle_1->th.address - task_handle_2->th.address; - else - *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; - return ompd_rc_ok; -} + ompd_rc_t ret; + ompd_address_t taddr; + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ + .cast("kmp_taskdata_t", 1) + .getArrayElement( + nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ + .getAddress(&taddr); -#if 0 // all task ids are initialized to zero -ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id - ) -{ - ompd_task_id_t id; - ompd_rc_t ret = ompd_get_task_id(task_handle, &id); - if (ret!=ompd_rc_ok) + if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(task_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*task_handle)->th = taddr; + (*task_handle)->ah = parallel_handle->ah; + return ret; + } + + ompd_rc_t ompd_release_task_handle( + ompd_task_handle_t *task_handle /* IN: OpenMP task handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + ompd_rc_t ret = callbacks->dmemory_free((void *)(task_handle)); + if (ret != ompd_rc_ok) return 
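/* Editor's sketch (not part of this patch): ompd_get_task_in_parallel returns the
 * implicit task of the nth thread in a team (it indexes t_implicit_task_taskdata),
 * so iterating a region's implicit tasks pairs it with ompd_get_num_threads. The
 * visit callback is a placeholder for whatever the tool does with each handle. */
#include "ompd.h"

ompd_rc_t for_each_implicit_task(ompd_parallel_handle_t *par,
                                 void (*visit)(ompd_task_handle_t *)) {
  ompd_word_t nthreads = 0;
  ompd_rc_t rc = ompd_get_num_threads(par, &nthreads);
  if (rc != ompd_rc_ok)
    return rc;
  for (int i = 0; i < (int)nthreads; i++) {
    ompd_task_handle_t *task = NULL;
    if (ompd_get_task_in_parallel(par, i, &task) != ompd_rc_ok)
      continue;                    /* skip slots the runtime cannot resolve */
    visit(task);
    ompd_release_task_handle(task);
  }
  return ompd_rc_ok;
}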
ret; - sprintf(*string_id, "0x%llx", (long long)id); return ompd_rc_ok; -} -#endif + } -/* --- 5 Process and Thread Settings ---------------------------------------- */ + ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value) { + if (!task_handle_1) + return ompd_rc_stale_handle; + if (!task_handle_2) + return ompd_rc_stale_handle; + if (task_handle_1->th.address - task_handle_2->th.address) + *cmp_value = task_handle_1->th.address - task_handle_2->th.address; + else + *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; + return ompd_rc_ok; + } -ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; +#if 0 // all task ids are initialized to zero + ompd_rc_t ompd_get_task_handle_string_id ( + ompd_task_handle_t *task_handle, + char **string_id + ) + { + ompd_task_id_t id; + ompd_rc_t ret = ompd_get_task_id(task_handle, &id); + if (ret!=ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); + if (ret!=ompd_rc_ok) + return ret; + sprintf(*string_id, "0x%llx", (long long)id); + return ompd_rc_ok; + } +#endif - if (!context) - return ompd_rc_stale_handle; + /* --- 5 Process and Thread Settings ---------------------------------------- */ - assert(callbacks && "Callback table not initialized!"); + ompd_rc_t + ompd_get_num_procs(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: number of processes */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; - int nth; - ret = TValue(context, "__kmp_avail_proc") - .castBase("__kmp_avail_proc") - .getValue(nth); - *val = nth; - return ret; -} + if (!context) + return ompd_rc_stale_handle; -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; + assert(callbacks && "Callback table not initialized!"); - if (!context) - return ompd_rc_stale_handle; + int nth; + ret = TValue(context, "__kmp_avail_proc") + .castBase("__kmp_avail_proc") + .getValue(nth); + *val = nth; + return ret; + } - assert(callbacks && "Callback table not initialized!"); + ompd_rc_t + ompd_get_thread_limit(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; - int nth; - ret = - TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); - *val = nth; - return ret; -} + if (!context) + return ompd_rc_stale_handle; -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ + assert(callbacks && "Callback table not initialized!"); -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ) { - if 
(!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + int nth; + ret = + TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); + *val = nth; + return ret; + } - assert(callbacks && "Callback table not initialized!"); + /* --- 6 Parallel Region Inqueries ------------------------------------------ */ + /* --- 6.1 Settings --------------------------------------------------------- */ - ompd_rc_t ret = ompd_rc_ok; - if (parallel_handle->lwt.address != 0) - *val = 1; - else { - uint32_t res; - ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_nproc") /*t.t_nproc*/ - .castBase() - .getValue(res); - *val = res; + ompd_rc_t ompd_get_num_threads( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: number of threads */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = ompd_rc_ok; + if (parallel_handle->lwt.address != 0) + *val = 1; + else { + uint32_t res; + ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_nproc") /*t.t_nproc*/ + .castBase() + .getValue(res); + *val = res; + } + return ret; } - return ret; -} -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: nesting level */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); + assert(callbacks && "Callback table not initialized!"); - uint32_t res; + uint32_t res; - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_level") /*t.t_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_level") /*t.t_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; + } -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_active_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: active nesting level */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if 
(!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); + assert(callbacks && "Callback table not initialized!"); - uint32_t res; + uint32_t res; - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_active_level") /*t.t_active_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_active_level") /*t.t_active_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; + } -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ + /* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ -ompd_rc_t ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ompd_rc_t ompd_get_parallel_data( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_address_t *data /* OUT: OpenMP parallel id */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; #if 0 - if (!ompd_state) - return ompd_rc_needs_state_tracking; + if (!ompd_state) + return ompd_rc_needs_state_tracking; #endif - assert(callbacks && "Callback table not initialized!"); - - TValue teamInfo; - if (parallel_handle->lwt.address != 0) - teamInfo = TValue(context, parallel_handle->lwt) - .cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - teamInfo = - TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ - ompd_rc_t ret = teamInfo - .access("ompt_team_info") /*t.ompt_team_info*/ - .cast("ompt_team_info_t", 0) - .access("parallel_data") /*t.ompt_team_info.parallel_id*/ - .getAddress(data); - return ret; -} + assert(callbacks && "Callback table not initialized!"); + + TValue teamInfo; + if (parallel_handle->lwt.address != 0) + teamInfo = TValue(context, parallel_handle->lwt) + .cast("ompt_lw_taskteam_t", 0); /*lwt*/ + else + teamInfo = + TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ + ompd_rc_t ret = teamInfo + .access("ompt_team_info") /*t.ompt_team_info*/ + .cast("ompt_team_info_t", 0) + .access("parallel_data") /*t.ompt_team_info.parallel_id*/ + .getAddress(data); + return ret; + } #if 0 // there is no such thing as a parallel function -ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ) -{ - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - parallel_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - - TValue teamInfo; - 
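/* Editor's sketch (not part of this patch): taken together, the settings
 * inquiries above summarize a region for the tool: team size from t_nproc (or 1
 * for a serialized team), nesting level from t_level, and active nesting level
 * from t_active_level. The struct and helper names below are illustrative. */
#include "ompd.h"

typedef struct {
  ompd_word_t num_threads;
  ompd_word_t level;
  ompd_word_t active_level;
} region_summary_t;

ompd_rc_t summarize_region(ompd_parallel_handle_t *par, region_summary_t *out) {
  ompd_rc_t rc = ompd_get_num_threads(par, &out->num_threads);
  if (rc == ompd_rc_ok)
    rc = ompd_get_level(par, &out->level);
  if (rc == ompd_rc_ok)
    rc = ompd_get_active_level(par, &out->active_level);
  return rc;
}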
if(parallel_handle->lwt.address!=0) - teamInfo = TValue(context, parallel_handle->lwt). - cast("ompt_lw_taskteam_t",0); /*lwt*/ - else - teamInfo = TValue(context, parallel_handle->th). - cast("kmp_base_team_t",0); /*t*/ - ompd_rc_t ret = teamInfo. - access("ompt_team_info"). /*t.ompt_team_info*/ - cast("ompt_team_info_t",0). - access("microtask"). /*t.ompt_team_info.microtask*/ - castBase(). - getValue(parallel_addr->address); - return ret; -} + ompd_rc_t ompd_get_parallel_function( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ + ) + { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + if (!ompd_state) + return ompd_rc_needs_state_tracking; + + assert(callbacks && "Callback table not initialized!"); + parallel_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + + TValue teamInfo; + if(parallel_handle->lwt.address!=0) + teamInfo = TValue(context, parallel_handle->lwt). + cast("ompt_lw_taskteam_t",0); /*lwt*/ + else + teamInfo = TValue(context, parallel_handle->th). + cast("kmp_base_team_t",0); /*t*/ + ompd_rc_t ret = teamInfo. + access("ompt_team_info"). /*t.ompt_team_info*/ + cast("ompt_team_info_t",0). + access("microtask"). /*t.ompt_team_info.microtask*/ + castBase(). + getValue(parallel_addr->address); + return ret; + } #endif // no parallel function -/* --- 7 Thread Inquiry ----------------------------------------------------- */ + /* --- 7 Thread Inquiry ----------------------------------------------------- */ -/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ + /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ -ompd_rc_t -ompd_get_thread_handle(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; + ompd_rc_t + ompd_get_thread_handle(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_kind_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; - if (!context) - return ompd_rc_stale_handle; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); - ompd_thread_context_t *tcontext; - ret = callbacks->get_thread_context_for_thread_id( - context, kind, sizeof_thread_id, thread_id, &tcontext); - if (ret != ompd_rc_ok) - return ret; + assert(callbacks && "Callback table not initialized!"); + ompd_thread_context_t *tcontext; + ret = callbacks->get_thread_context_for_thread_id( + context, kind, sizeof_thread_id, thread_id, &tcontext); + if (ret != ompd_rc_ok) + return ret; - int tId; + int tId; - if (kind == ompd_thread_id_cudalogical) { - ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; + if (kind == ompd_thread_id_cudalogical) { + ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - // 
omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->data.items.threadId + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->data.items.threadId - ret = - TValue(context, tcontext, "omptarget_nvptx_threadPrivateContext", + ret = + TValue(context, tcontext, "omptarget_nvptx_threadPrivateContext", OMPD_SEGMENT_CUDA_PTX_SHARED) .cast("omptarget_nvptx_ThreadPrivateContext", 1, OMPD_SEGMENT_CUDA_PTX_SHARED) .access("topTaskDescr") .cast("omptarget_nvptx_TaskDescr", 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .getArrayElement(p->threadIdx.x) - .access("data__items__threadId") + .access("items__threadId") .castBase(ompd_type_short) .getValue(tId); @@ -989,7 +932,10 @@ ompd_get_thread_handle(ompd_address_space_handle_t return ret; if (tId != p->threadIdx.x) + { + printf("tId(%i) != p->threadIdx.x(%i)\n", tId, p->threadIdx.x); return ompd_rc_stale_handle; + } } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") diff --git a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index 4fc9ef051..4ee4ba27c 100644 --- a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -54,6 +54,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) src/reduction.cu src/sync.cu src/task.cu + src/ompd-specific.cu ) set(omp_data_objects src/omp_data.cu) diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu new file mode 100644 index 000000000..ff5f29862 --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -0,0 +1,62 @@ +#include "ompd-specific.h" +#ifdef OMPD_SUPPORT + +/** + * Declaration of symbols to hold struct size and member offset information + */ + +__device__ __shared__ static int ompd_target_initialized; + +#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m; +OMPD_FOREACH_ACCESS(ompd_target_declare_access) +#undef ompd_target_declare_access + +#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m; + OMPD_FOREACH_ACCESS(ompd_target_declare_sizeof_member) +#undef ompd_target_declare_sizeof_member + +#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t; + OMPD_FOREACH_SIZEOF(ompd_target_declare_sizeof) +#undef ompd_target_declare_sizeof + +__device__ __shared__ + uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadId; + +__device__ __shared__ + uint64_t ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadId; + + +__device__ void ompd_init ( void ) +{ + PRINT0(LD_IO, "call to ompd_init\n"); + if (ompd_target_initialized) + return; + + PRINT0(LD_IO, "ompd_init: initializing\n"); + +#define ompd_target_init_access(t,m) ompd_access__##t##__##m = (uint64_t)&(((t*)0)->m); + OMPD_FOREACH_ACCESS(ompd_target_init_access) +#undef ompd_target_init_access + + ompd_access__omptarget_nvptx_TaskDescr__items__threadId = + (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadId); + +#define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m = sizeof(((t*)0)->m); + OMPD_FOREACH_ACCESS(ompd_target_init_sizeof_member) +#undef ompd_target_init_sizeof_member + + ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadId = + (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadId); + +#define ompd_target_init_sizeof(t) ompd_sizeof__##t = sizeof(t); + OMPD_FOREACH_SIZEOF(ompd_target_init_sizeof) +#undef ompd_target_init_sizeof + + ompd_target_initialized = 1; +} + +__device__ void 
ompd_bp_parallel_begin (){ asm (""); } +__device__ void ompd_bp_parallel_end (){ asm (""); } +__device__ void ompd_bp_task_begin (){ asm (""); } +__device__ void ompd_bp_task_end (){ asm (""); } +#endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h new file mode 100644 index 000000000..cace73475 --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -0,0 +1,30 @@ +//TODO: (mr) move this to cmake file +#define OMPD_SUPPORT 1 + +#ifdef OMPD_SUPPORT + +#include "omptarget-nvptx.h" +#include "state-queue.h" +#include "option.h" +#include + +#ifndef __OMPD_SPECIFIC_H__ +#define __OMPD_SPECIFIC_H__ + + +__device__ void ompd_init( void ); +extern "C" __device__ void ompd_bp_parallel_begin ( void ); +extern "C" __device__ void ompd_bp_parallel_end ( void ); +extern "C" __device__ void ompd_bp_task_begin ( void ); +extern "C" __device__ void ompd_bp_task_end ( void ); + + +#define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext, topTaskDescr) \ + +#define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ + OMPD_SIZEOF(omptarget_nvptx_ThreadPrivateContext)\ + OMPD_SIZEOF(omptarget_nvptx_TaskDescr) + +#endif /* OMPD_SUPPORT */ +#endif diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 677654dd5..294647fdd 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -92,6 +92,9 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); currTaskDescr->ThreadLimit() = ThreadLimit; +#ifdef OMPD_SUPPORT + ompd_init(); +#endif /*OMPD_SUPPORT*/ } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -173,6 +176,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; } +#ifdef OMPD_SUPPORT + ompd_init(); +#endif /*OMPD_SUPPORT*/ } EXTERN void __kmpc_spmd_kernel_deinit() { diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 2bc5819e6..f75b70fff 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -32,6 +32,7 @@ #include "option.h" // choices we have #include "state-queue.h" #include "support.h" +#include "ompd-specific.h" #define OMPTARGET_NVPTX_VERSION 1.1 @@ -66,6 +67,9 @@ // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { +#if OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /* OMPD_SUPPORT */ public: // All these methods must be called by the master thread only. 
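/* Editor's note (illustration only, not an additional change): the device-side
 * ompd_init above publishes nothing but layout information. For the single entry
 * listed in ompd-specific.h, OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,
 * topTaskDescr), the ompd_target_init_access and ompd_target_init_sizeof_member
 * macros expand inside ompd_init to roughly the following two statements, which
 * store the member offset and member size into the __device__ __shared__ symbols
 * declared at the top of ompd-specific.cu: */
ompd_access__omptarget_nvptx_ThreadPrivateContext__topTaskDescr =
    (uint64_t)&(((omptarget_nvptx_ThreadPrivateContext *)0)->topTaskDescr);
ompd_sizeof__omptarget_nvptx_ThreadPrivateContext__topTaskDescr =
    sizeof(((omptarget_nvptx_ThreadPrivateContext *)0)->topTaskDescr);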
INLINE void Init() { @@ -150,6 +154,9 @@ extern __device__ __shared__ DataSharingStateTy DataSharingState; // task ICV and (implicit & explicit) task state class omptarget_nvptx_TaskDescr { +#if OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /* OMPD_SUPPORT */ public: // methods for flags INLINE omp_sched_t GetRuntimeSched(); @@ -310,6 +317,9 @@ class omptarget_nvptx_TeamDescr { // tid refers here to the global thread id // do not support multiple concurrent kernel a this time class omptarget_nvptx_ThreadPrivateContext { +#if OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /* OMPD_SUPPORT */ public: // task INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { From 63309d5b65291ec1c072fc439da0da10a4a85019 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 2 Jul 2018 14:48:05 -0700 Subject: [PATCH 05/64] [OMPD] Adds some support for omp states on cuda * libompd can retrieve thread state information for a cuda thread * ODB tries to summarize states of many cuda threads (this is still a bit ugly and doesnt repsect blocks) * Currently openmp cuda threads can only be in an undefined state or parallel work state (parallel work is not consistent) --- libompd/gdb-wrapper/OMPDCommand.cpp | 31 +- libompd/src/omp-debug.cpp | 100 ++- libomptarget/deviceRTLs/nvptx/src/loop.cu | 10 +- .../deviceRTLs/nvptx/src/ompd-specific.cu | 13 +- .../deviceRTLs/nvptx/src/ompd-specific.h | 24 +- .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 748 +++++++++--------- libomptarget/deviceRTLs/nvptx/src/parallel.cu | 9 + 7 files changed, 514 insertions(+), 421 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index bfbfc4aa0..4139e3968 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -202,6 +202,7 @@ void OMPDThreads::execute() const ret = functions->ompd_get_state(thread_handle, &state, &wait_id); printf(" %-12u %p 0x%lx\t%i\t%lx\n", (unsigned int)i.first, thread_handle, i.second, state, wait_id); + functions->ompd_release_thread_handle(thread_handle); } else { @@ -214,10 +215,15 @@ void OMPDThreads::execute() const vector cuda_ContextPools; map device_initialized; map address_spaces; + ompd_word_t last_state = -1; + ompd_cudathread_coord_t last_coords; + + printf("\nCUDA THREADS\n"); + printf("Cuda block from Thread to Thread state\n"); + printf("-------------------------------------------------\n"); for(auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { - cout << "Cuda device with context " << i.coord.cudaContext << " not initialized as OpenMP device. Trying to initialize\n"; OMPDCudaContextPool* cpool; cpool = new OMPDCudaContextPool(&i); ompd_rc_t result; @@ -233,10 +239,8 @@ void OMPDThreads::execute() const if (result != ompd_rc_ok) { - cout << "Could not initalize device with context " << i.coord.cudaContext << ". 
Probably not a OpenMP device\n"; continue; } - cout << "Device initialized\n"; address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; } @@ -249,7 +253,28 @@ void OMPDThreads::execute() const &thread_handle); if (ret == ompd_rc_ok) + { + ompd_word_t state; + functions->ompd_get_state(thread_handle, &state, NULL); + if (last_state == -1) { + last_state = state; + last_coords = i.coord; + printf("(%li,0,0) (%li,%li,%li)", i.coord.blockIdx.x, i.coord.threadIdx.x, i.coord.threadIdx.y, i.coord.threadIdx.z); + } else if (state != last_state || i.coord.blockIdx.x != last_coords.blockIdx.x) { + printf(" (%li,%li,%li) %li\n", last_coords.threadIdx.x, last_coords.threadIdx.y, last_coords.threadIdx.z, last_state); + last_coords = i.coord; + last_state = state; + printf("(%li,0,0) (%li,%li,%li)", i.coord.blockIdx.x, i.coord.threadIdx.x, i.coord.threadIdx.y, i.coord.threadIdx.z); + } else { /* state == last_state*/ + last_coords = i.coord; + } + functions->ompd_release_thread_handle(thread_handle); omp_cuda_threads++; + } + } + + if (last_state != -1) { + printf(" (%i,%i,%i) %i\n", last_coords.threadIdx.x, last_coords.threadIdx.y, last_coords.threadIdx.z, last_state); } if (cuda.threads.size() != 0) { diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index a163e143c..85aa45346 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -146,13 +146,13 @@ ompd_rc_t ompd_device_initialize( if (ret != ompd_rc_ok) return ret; - + for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { uint64_t cuda_ctx; // TODO: (mr) think of a better way to cast contexts ret = TValue(process_handle->context, "ompd_CudaContextArray"). - cast("ompd_cuda_context_ptr_t",1). + cast("ompd_cuda_context_ptr_t",1). getArrayElement(i). castBase(ompd_type_long_long). getValue(cuda_ctx); @@ -170,7 +170,7 @@ ompd_rc_t ompd_device_initialize( return ompd_rc_ok; } } - + return ompd_rc_unavailable; } @@ -878,7 +878,7 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, access("ompt_team_info"). /*t.ompt_team_info*/ cast("ompt_team_info_t",0). access("microtask"). /*t.ompt_team_info.microtask*/ - castBase(). + castBase(). 
getValue(parallel_addr->address); return ret; } @@ -914,28 +914,40 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, if (kind == ompd_thread_id_cudalogical) { ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->data.items.threadId + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->items.threadId + TValue th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(p->threadIdx.x); + + ompd_address_t taddr; + ret = th.getAddress(&taddr); + + if (ret != ompd_rc_ok) + return ret; - ret = - TValue(context, tcontext, "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) - .cast("omptarget_nvptx_ThreadPrivateContext", 1, - OMPD_SEGMENT_CUDA_PTX_SHARED) - .access("topTaskDescr") - .cast("omptarget_nvptx_TaskDescr", 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .getArrayElement(p->threadIdx.x) - .access("items__threadId") - .castBase(ompd_type_short) - .getValue(tId); + ret = th.access("items__threadId") + .castBase(ompd_type_short) + .getValue(tId); if (ret != ompd_rc_ok) return ret; if (tId != p->threadIdx.x) - { - printf("tId(%i) != p->threadIdx.x(%i)\n", tId, p->threadIdx.x); return ompd_rc_stale_handle; - } + + ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") @@ -1064,26 +1076,38 @@ ompd_rc_t ompd_get_state( return ompd_rc_needs_state_tracking; #endif + ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); - TValue ompt_thread_info = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("ompt_thread_info") /*__kmp_threads[t]->th.ompt_thread_info*/ - .cast("ompt_thread_info_t"); - if (ompt_thread_info.gotError()) - return ompt_thread_info.getError(); - ompd_rc_t ret = - ompt_thread_info - .access("state") /*__kmp_threads[t]->th.ompt_thread_info.state*/ - .castBase() - .getValue(*state); - if (ret != ompd_rc_ok) - return ret; - ret = ompt_thread_info - .access("wait_id") /*__kmp_threads[t]->th.ompt_thread_info.state*/ + if (thread_handle->ah->kind == ompd_device_kind_cuda) { + if (wait_id) + *wait_id = 0; //TODO: (mr) implement wait_ids in nvptx device rtl + ret = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("state") + .castBase(ompd_type_long_long) + .getValue(*state); + } else { + TValue ompt_thread_info = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("ompt_thread_info") /*__kmp_threads[t]->th.ompt_thread_info*/ + .cast("ompt_thread_info_t"); + if (ompt_thread_info.gotError()) + return ompt_thread_info.getError(); + ret = ompt_thread_info + .access("state") /*__kmp_threads[t]->th.ompt_thread_info.state*/ .castBase() - .getValue(*wait_id); + .getValue(*state); + if (ret != ompd_rc_ok) + return ret; + ret = ompt_thread_info + .access("wait_id") /*__kmp_threads[t]->th.ompt_thread_info.state*/ + .castBase() + 
.getValue(*wait_id); + } return ret; } @@ -1441,7 +1465,7 @@ ompd_rc_t ompd_get_task_function( return ompd_rc_bad_input; #else ompd_rc_t ret; -#endif +#endif task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; TValue taskInfo; if(task_handle->lwt.address!=0) @@ -1454,7 +1478,7 @@ ompd_rc_t ompd_get_task_function( access("ompt_task_info"). /*td->ompt_task_info*/ cast("ompt_task_info_t"). access("function"). /*td->ompt_task_info.function*/ - castBase(). + castBase(). getValue(task_addr->address); return ret; } diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index 91eaaca67..2c115a980 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" - +#ifdef OMPD_SUPPORT + #include "ompd-specific.h" +#endif /*OMPD_SUPPORT*/ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // template class that encapsulate all the helper functions @@ -205,6 +207,9 @@ public: IsRuntimeUninitialized), GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride)); +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// @@ -712,6 +717,9 @@ void __kmpc_for_static_init_8u_simple_generic( } EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) { +#ifdef OMP_SUPPORT + ompd_reset_device_thread_state() +#endif PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index ff5f29862..19dcbf52f 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -1,6 +1,6 @@ -#include "ompd-specific.h" #ifdef OMPD_SUPPORT - +#include "ompd-specific.h" +#include "omptarget-nvptx.h" /** * Declaration of symbols to hold struct size and member offset information */ @@ -28,12 +28,11 @@ __device__ __shared__ __device__ void ompd_init ( void ) { - PRINT0(LD_IO, "call to ompd_init\n"); + getMyTopTaskDescriptor()->ompd_thread_info.state = omp_state_undefined; + if (ompd_target_initialized) return; - PRINT0(LD_IO, "ompd_init: initializing\n"); - #define ompd_target_init_access(t,m) ompd_access__##t##__##m = (uint64_t)&(((t*)0)->m); OMPD_FOREACH_ACCESS(ompd_target_init_access) #undef ompd_target_init_access @@ -55,6 +54,10 @@ __device__ void ompd_init ( void ) ompd_target_initialized = 1; } +__device__ void ompd_set_device_thread_state(omp_state_t state) { + getMyTopTaskDescriptor()->ompd_thread_info.state = state; +} + __device__ void ompd_bp_parallel_begin (){ asm (""); } __device__ void ompd_bp_parallel_end (){ asm (""); } __device__ void ompd_bp_task_begin (){ asm (""); } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index cace73475..38cf70e3b 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -3,7 +3,6 @@ #ifdef OMPD_SUPPORT -#include "omptarget-nvptx.h" #include "state-queue.h" #include "option.h" #include @@ -21,10 +20,31 @@ extern "C" __device__ void ompd_bp_task_end ( void ); #define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ 
OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext, topTaskDescr) \ + OMPD_ACCESS(omptarget_nvptx_TaskDescr,ompd_thread_info) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,state) #define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ OMPD_SIZEOF(omptarget_nvptx_ThreadPrivateContext)\ - OMPD_SIZEOF(omptarget_nvptx_TaskDescr) + OMPD_SIZEOF(omptarget_nvptx_TaskDescr) \ + OMPD_SIZEOF(ompd_nvptx_thread_info_t) + + +/* we only support work states for the moment */ +typedef enum { + omp_state_undefined = 0x102, + omp_state_work_serial = 0x000, + omp_state_work_parallel = 0x001 +} omp_state_t; + +__device__ void ompd_set_device_thread_state(omp_state_t); + +INLINE void ompd_reset_device_thread_state() { + ompd_set_device_thread_state(omp_state_undefined); +} + +typedef struct { + uint64_t state; +} ompd_nvptx_thread_info_t; #endif /* OMPD_SUPPORT */ #endif diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index f75b70fff..ebda05654 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -1,31 +1,31 @@ //===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// // -// The LLVM Compiler Infrastructure -// -// This file is dual licensed under the MIT and the University of Illinois Open -// Source Licenses. See LICENSE.txt for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. -// -//===----------------------------------------------------------------------===// + // The LLVM Compiler Infrastructure + // + // This file is dual licensed under the MIT and the University of Illinois Open + // Source Licenses. See LICENSE.txt for details. + // + //===----------------------------------------------------------------------===// + // + // This file contains the declarations of all library macros, types, + // and functions. + // + //===----------------------------------------------------------------------===// #ifndef __OMPTARGET_NVPTX_H #define __OMPTARGET_NVPTX_H -// std includes + // std includes #include #include #include -// cuda includes + // cuda includes #include #include -// local includes + // local includes #include "counter_group.h" #include "debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user @@ -36,11 +36,11 @@ #define OMPTARGET_NVPTX_VERSION 1.1 -// used by the library for the interface with the app + // used by the library for the interface with the app #define DISPATCH_FINISHED 0 #define DISPATCH_NOTFINISHED 1 -// used by dynamic scheduling + // used by dynamic scheduling #define FINISHED 0 #define NOT_FINISHED 1 #define LAST_CHUNK 2 @@ -48,390 +48,394 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 -// Macros for Cuda intrinsics -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. -// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). + // Macros for Cuda intrinsics + // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. + // Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). 
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down_sync((mask), (var), (delta), (width)) + __shfl_down_sync((mask), (var), (delta), (width)) #define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate)) #define __ACTIVEMASK() __activemask() #else #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down((var), (delta), (width)) + __shfl_down((var), (delta), (width)) #define __BALLOT_SYNC(mask, predicate) __ballot((predicate)) #define __ACTIVEMASK() __ballot(1) #endif -// arguments needed for L0 parallelism only. -class omptarget_nvptx_SharedArgs { + // arguments needed for L0 parallelism only. + class omptarget_nvptx_SharedArgs { #if OMPD_SUPPORT - friend void __device__ ompd_init( void ); + friend void __device__ ompd_init( void ); #endif /* OMPD_SUPPORT */ -public: - // All these methods must be called by the master thread only. - INLINE void Init() { - args = buffer; - nArgs = MAX_SHARED_ARGS; - } - INLINE void DeInit() { - // Free any memory allocated for outlined parallel function with a large - // number of arguments. - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, (char *)"new extended args"); - Init(); + public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; } - } - INLINE void EnsureSize(size_t size) { - if (size > nArgs) { + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. if (nArgs > MAX_SHARED_ARGS) { SafeFree(args, (char *)"new extended args"); + Init(); + } + } + INLINE void EnsureSize(size_t size) { + if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + } + args = (void **) SafeMalloc(size * sizeof(void *), + (char *)"new extended args"); + nArgs = size; } - args = (void **) SafeMalloc(size * sizeof(void *), - (char *)"new extended args"); - nArgs = size; } - } - // Called by all threads. - INLINE void **GetArgs() { return args; }; -private: - // buffer of pre-allocated arguments. - void *buffer[MAX_SHARED_ARGS]; - // pointer to arguments buffer. - // starts off as a pointer to 'buffer' but can be dynamically allocated. - void **args; - // starts off as MAX_SHARED_ARGS but can increase in size. - uint32_t nArgs; -}; - -extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; - -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. - DS_Max_Worker_Threads = 992, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 32, -}; - -// Data structure to keep in shared memory that traces the current slot, stack, -// and frame pointer as well as the active threads that didn't exit the current -// environment. 
-struct DataSharingStateTy { - __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; - void *StackPtr[DS_Max_Warp_Number]; - __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; - int32_t ActiveThreads[DS_Max_Warp_Number]; -}; -// Additional worker slot type which is initialized with the default worker slot -// size of 4*32 bytes. -struct __kmpc_data_sharing_worker_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; -}; -// Additional master slot type which is initialized with the default master slot -// size of 4 bytes. -struct __kmpc_data_sharing_master_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Slot_Size]; -}; -extern __device__ __shared__ DataSharingStateTy DataSharingState; - -//////////////////////////////////////////////////////////////////////////////// -// task ICV and (implicit & explicit) task state - -class omptarget_nvptx_TaskDescr { + // Called by all threads. + INLINE void **GetArgs() { return args; }; + private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; + }; + + extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; + + // Data sharing related quantities, need to match what is used in the compiler. + enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, + }; + + // Data structure to keep in shared memory that traces the current slot, stack, + // and frame pointer as well as the active threads that didn't exit the current + // environment. + struct DataSharingStateTy { + __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; + void *StackPtr[DS_Max_Warp_Number]; + __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number]; + void *FramePtr[DS_Max_Warp_Number]; + int32_t ActiveThreads[DS_Max_Warp_Number]; + }; + // Additional worker slot type which is initialized with the default worker slot + // size of 4*32 bytes. + struct __kmpc_data_sharing_worker_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; + }; + // Additional master slot type which is initialized with the default master slot + // size of 4 bytes. 
+ struct __kmpc_data_sharing_master_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Slot_Size]; + }; + extern __device__ __shared__ DataSharingStateTy DataSharingState; + + //////////////////////////////////////////////////////////////////////////////// + // task ICV and (implicit & explicit) task state + + class omptarget_nvptx_TaskDescr { #if OMPD_SUPPORT - friend void __device__ ompd_init( void ); + friend void __device__ ompd_init( void ); + friend void __device__ ompd_set_device_thread_state(omp_state_t state); #endif /* OMPD_SUPPORT */ -public: - // methods for flags - INLINE omp_sched_t GetRuntimeSched(); - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; } - INLINE void SetDynamic() { - items.flags = items.flags | TaskDescr_IsDynamic; - } - INLINE void ClearDynamic() { - items.flags = items.flags & (~TaskDescr_IsDynamic); - } - INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() { - return items.flags & TaskDescr_InParL2P; - } - INLINE int IsParallelConstruct() { - return items.flags & TaskDescr_IsParConstr; - } - INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } - // methods for other fields - INLINE uint16_t &NThreads() { return items.nthreads; } - INLINE uint16_t &ThreadLimit() { return items.threadlimit; } - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } - INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } - INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { - prev = taskDescr; - } - // init & copy - INLINE void InitLevelZeroTaskDescr(); - INLINE void InitLevelOneTaskDescr(uint16_t tnum, - omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, - uint16_t tnum); - INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); - INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, - uint16_t tid, uint16_t tnum); - -private: - // bits for flags: (7 used, 1 free) - // 3 bits (SchedMask) for runtime schedule - // 1 bit (IsDynamic) for dynamic schedule (false = static) - // 1 bit (InPar) if this thread has encountered one or more parallel region - // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) - // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel - // region - static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); - static const uint8_t TaskDescr_IsDynamic = 0x8; - static const uint8_t TaskDescr_InPar = 0x10; - static const uint8_t TaskDescr_IsParConstr = 0x20; - static const uint8_t TaskDescr_InParL2P = 0x40; - - struct TaskDescr_items { - uint8_t flags; // 6 bit used (see flag above) - uint8_t unused; - uint16_t nthreads; // thread num for subsequent parallel regions - uint16_t threadlimit; // thread limit ICV - uint16_t threadId; // thread id - uint16_t threadsInTeam; // threads in current team - uint64_t runtimeChunkSize; // runtime chunk 
size - } items; - omptarget_nvptx_TaskDescr *prev; -}; - -// build on kmp -typedef struct omptarget_nvptx_ExplicitTaskDescr { - omptarget_nvptx_TaskDescr - taskDescr; // omptarget_nvptx task description (must be first) - kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) -} omptarget_nvptx_ExplicitTaskDescr; - -//////////////////////////////////////////////////////////////////////////////// -// Descriptor of a parallel region (worksharing in general) - -class omptarget_nvptx_WorkDescr { - -public: - // access to data - INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } - INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } - // init - INLINE void InitWorkDescr(); - -private: - omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) - omptarget_nvptx_TaskDescr masterTaskICV; - bool hasCancel; -}; - -//////////////////////////////////////////////////////////////////////////////// - -class omptarget_nvptx_TeamDescr { -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { - return &levelZeroTaskDescr; - } - INLINE omptarget_nvptx_WorkDescr &WorkDescr() { - return workDescrForActiveParallel; - } - INLINE omp_lock_t *CriticalLock() { return &criticalLock; } - INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } - - // init - INLINE void InitTeamDescr(); - - INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { - // If this is invoked by the master thread of the master warp then intialize - // it with a smaller slot. - if (IsMasterThread) { + public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched(); + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; } + INLINE void SetDynamic() { + items.flags = items.flags | TaskDescr_IsDynamic; + } + INLINE void ClearDynamic() { + items.flags = items.flags & (~TaskDescr_IsDynamic); + } + INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; } + INLINE int InL2OrHigherParallelRegion() { + return items.flags & TaskDescr_InParL2P; + } + INLINE int IsParallelConstruct() { + return items.flags & TaskDescr_IsParConstr; + } + INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t &NThreads() { return items.nthreads; } + INLINE uint16_t &ThreadLimit() { return items.threadlimit; } + INLINE uint16_t &ThreadId() { return items.threadId; } + INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } + INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + prev = taskDescr; + } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelOneTaskDescr(uint16_t tnum, + omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, + uint16_t tnum); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + + private: + // bits for flags: (7 
used, 1 free) + // 3 bits (SchedMask) for runtime schedule + // 1 bit (IsDynamic) for dynamic schedule (false = static) + // 1 bit (InPar) if this thread has encountered one or more parallel region + // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel + // region + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_IsDynamic = 0x8; + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + static const uint8_t TaskDescr_InParL2P = 0x40; + + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t nthreads; // thread num for subsequent parallel regions + uint16_t threadlimit; // thread limit ICV + uint16_t threadId; // thread id + uint16_t threadsInTeam; // threads in current team + uint64_t runtimeChunkSize; // runtime chunk size + } items; +#ifdef OMPD_SUPPORT + ompd_nvptx_thread_info_t ompd_thread_info; +#endif + omptarget_nvptx_TaskDescr *prev; + }; + + // build on kmp + typedef struct omptarget_nvptx_ExplicitTaskDescr { + omptarget_nvptx_TaskDescr + taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) + } omptarget_nvptx_ExplicitTaskDescr; + + //////////////////////////////////////////////////////////////////////////////// + // Descriptor of a parallel region (worksharing in general) + + class omptarget_nvptx_WorkDescr { + + public: + // access to data + INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } + INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + // init + INLINE void InitWorkDescr(); + + private: + omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) + omptarget_nvptx_TaskDescr masterTaskICV; + bool hasCancel; + }; + + //////////////////////////////////////////////////////////////////////////////// + + class omptarget_nvptx_TeamDescr { + public: + // access to data + INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { + return &levelZeroTaskDescr; + } + INLINE omptarget_nvptx_WorkDescr &WorkDescr() { + return workDescrForActiveParallel; + } + INLINE omp_lock_t *CriticalLock() { return &criticalLock; } + INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } + + // init + INLINE void InitTeamDescr(); + + INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { + // If this is invoked by the master thread of the master warp then intialize + // it with a smaller slot. + if (IsMasterThread) { + // Do not initalize this slot again if it has already been initalized. + if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) + return 0; + // Initialize the pointer to the end of the slot given the size of the + // data section. DataEnd is non-inclusive. + master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; + // We currently do not have a next slot. + master_rootS[0].Next = 0; + master_rootS[0].Prev = 0; + master_rootS[0].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&master_rootS[0]; + } // Do not initalize this slot again if it has already been initalized. - if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) + if (worker_rootS[wid].DataEnd == + &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) return 0; - // Initialize the pointer to the end of the slot given the size of the - // data section. 
DataEnd is non-inclusive. - master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; + // Initialize the pointer to the end of the slot given the size of the data + // section. DataEnd is non-inclusive. + worker_rootS[wid].DataEnd = + &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; // We currently do not have a next slot. - master_rootS[0].Next = 0; - master_rootS[0].Prev = 0; - master_rootS[0].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&master_rootS[0]; + worker_rootS[wid].Next = 0; + worker_rootS[wid].Prev = 0; + worker_rootS[wid].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; } - // Do not initalize this slot again if it has already been initalized. - if (worker_rootS[wid].DataEnd == - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the data - // section. DataEnd is non-inclusive. - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. - worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - -private: - omptarget_nvptx_TaskDescr - levelZeroTaskDescr; // icv for team master initial thread - omptarget_nvptx_WorkDescr - workDescrForActiveParallel; // one, ONLY for the active par - omp_lock_t criticalLock; - uint64_t lastprivateIterBuffer; - - __align__(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1]; - __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; -}; - -//////////////////////////////////////////////////////////////////////////////// -// thread private data (struct of arrays for better coalescing) -// tid refers here to the global thread id -// do not support multiple concurrent kernel a this time -class omptarget_nvptx_ThreadPrivateContext { + + private: + omptarget_nvptx_TaskDescr + levelZeroTaskDescr; // icv for team master initial thread + omptarget_nvptx_WorkDescr + workDescrForActiveParallel; // one, ONLY for the active par + omp_lock_t criticalLock; + uint64_t lastprivateIterBuffer; + + __align__(16) + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1]; + __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; + }; + + //////////////////////////////////////////////////////////////////////////////// + // thread private data (struct of arrays for better coalescing) + // tid refers here to the global thread id + // do not support multiple concurrent kernel a this time + class omptarget_nvptx_ThreadPrivateContext { #if OMPD_SUPPORT - friend void __device__ ompd_init( void ); + friend void __device__ ompd_init( void ); #endif /* OMPD_SUPPORT */ -public: - // task - INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { - return &levelOneTaskDescr[tid]; - } - INLINE void SetTopLevelTaskDescr(int tid, - omptarget_nvptx_TaskDescr *taskICV) { - topTaskDescr[tid] = taskICV; - } - INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid); - // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } - // simd - INLINE uint16_t &SimdLimitForNextSimd(int tid) { - return nextRegion.slim[tid]; - } - // sync - INLINE Counter &Priv(int tid) { return priv[tid]; } - INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; } - // schedule (for dispatch) - INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } - INLINE 
int64_t &Chunk(int tid) { return chunk[tid]; } - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - // state for dispatch with dyn/guided - INLINE Counter &CurrentEvent(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } - // state for dispatch with static - INLINE Counter &NextLowerBound(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } - - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } - - INLINE void InitThreadPrivateContext(int tid); - INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } - INLINE uint64_t GetSourceQueue() { return SourceQueue; } - -private: - // team context for this team - omptarget_nvptx_TeamDescr teamContext; - // task ICV for implict threads in the only parallel region - omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; - // pointer where to find the current task ICV (top of the stack) - omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - union { - // Only one of the two is live at the same time. + public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { + return &levelOneTaskDescr[tid]; + } + INLINE void SetTopLevelTaskDescr(int tid, + omptarget_nvptx_TaskDescr *taskICV) { + topTaskDescr[tid] = taskICV; + } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid); // parallel - uint16_t tnum[MAX_THREADS_PER_TEAM]; - // simd limit - uint16_t slim[MAX_THREADS_PER_TEAM]; - } nextRegion; - // sync - Counter priv[MAX_THREADS_PER_TEAM]; - // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; - Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; - // Queue to which this object must be returned. - uint64_t SourceQueue; -}; - -/// Device envrionment data -struct omptarget_device_environmentTy { - int32_t debug_level; -}; - -//////////////////////////////////////////////////////////////////////////////// -// global device envrionment -//////////////////////////////////////////////////////////////////////////////// - -extern __device__ omptarget_device_environmentTy omptarget_device_environment; - -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern __device__ __shared__ - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; -extern __device__ __shared__ uint32_t execution_param; -extern __device__ __shared__ void *ReductionScratchpadPtr; - -//////////////////////////////////////////////////////////////////////////////// -// work function (outlined parallel/simd functions) and arguments. -// needed for L1 parallelism only. 
-//////////////////////////////////////////////////////////////////////////////// - -typedef void *omptarget_nvptx_WorkFn; -extern volatile __device__ __shared__ omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; - -//////////////////////////////////////////////////////////////////////////////// -// get private data structures -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); - -//////////////////////////////////////////////////////////////////////////////// -// inlined implementation -//////////////////////////////////////////////////////////////////////////////// + INLINE uint16_t &NumThreadsForNextParallel(int tid) { + return nextRegion.tnum[tid]; + } + // simd + INLINE uint16_t &SimdLimitForNextSimd(int tid) { + return nextRegion.slim[tid]; + } + // sync + INLINE Counter &Priv(int tid) { return priv[tid]; } + INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; } + // schedule (for dispatch) + INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } + INLINE int64_t &Chunk(int tid) { return chunk[tid]; } + INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } + // state for dispatch with dyn/guided + INLINE Counter &CurrentEvent(int tid) { + return currEvent_or_nextLowerBound[tid]; + } + INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } + // state for dispatch with static + INLINE Counter &NextLowerBound(int tid) { + return currEvent_or_nextLowerBound[tid]; + } + INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } + + INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } + + INLINE void InitThreadPrivateContext(int tid); + INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } + INLINE uint64_t GetSourceQueue() { return SourceQueue; } + + private: + // team context for this team + omptarget_nvptx_TeamDescr teamContext; + // task ICV for implict threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; + union { + // Only one of the two is live at the same time. + // parallel + uint16_t tnum[MAX_THREADS_PER_TEAM]; + // simd limit + uint16_t slim[MAX_THREADS_PER_TEAM]; + } nextRegion; + // sync + Counter priv[MAX_THREADS_PER_TEAM]; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for + int64_t chunk[MAX_THREADS_PER_TEAM]; + int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; + // state for dispatch with dyn/guided OR static (never use both at a time) + Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; + Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; + // Queue to which this object must be returned. 
+ uint64_t SourceQueue; + }; + + /// Device envrionment data + struct omptarget_device_environmentTy { + int32_t debug_level; + }; + + //////////////////////////////////////////////////////////////////////////////// + // global device envrionment + //////////////////////////////////////////////////////////////////////////////// + + extern __device__ omptarget_device_environmentTy omptarget_device_environment; + + //////////////////////////////////////////////////////////////////////////////// + + //////////////////////////////////////////////////////////////////////////////// + // global data tables + //////////////////////////////////////////////////////////////////////////////// + + extern __device__ __shared__ + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + extern __device__ __shared__ uint32_t execution_param; + extern __device__ __shared__ void *ReductionScratchpadPtr; + + //////////////////////////////////////////////////////////////////////////////// + // work function (outlined parallel/simd functions) and arguments. + // needed for L1 parallelism only. + //////////////////////////////////////////////////////////////////////////////// + + typedef void *omptarget_nvptx_WorkFn; + extern volatile __device__ __shared__ omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; + + //////////////////////////////////////////////////////////////////////////////// + // get private data structures + //////////////////////////////////////////////////////////////////////////////// + + INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); + INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); + INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); + INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); + + //////////////////////////////////////////////////////////////////////////////// + // inlined implementation + //////////////////////////////////////////////////////////////////////////////// #include "counter_groupi.h" #include "omptarget-nvptxi.h" diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d4546284f..b6ef81b27 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -34,6 +34,9 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#ifdef OMPD_SUPPORT + #include "ompd-specific.h" +#endif /*OMPD_SUPPORT*/ typedef struct ConvergentSimdJob { omptarget_nvptx_TaskDescr taskDescr; @@ -355,6 +358,9 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, newTaskDescr->ThreadId(), newTaskDescr->NThreads()); isActive = true; +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } return isActive; @@ -369,6 +375,9 @@ EXTERN void __kmpc_kernel_end_parallel() { omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// From e1949d35195440cedb9f91df412b25c9ac62e3c2 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 2 Jul 2018 15:37:02 -0700 Subject: [PATCH 06/64] [OMPD] Further align code to spec * Moved some removed/renamed types out of ompd.h to ompd-private.h (we still need them internally i think) * Removed dead code * Some 
types have been renamed Still need to rename all callbacks and check for completeness --- libompd/gdb-wrapper/OMPDCommand.cpp | 14 - libompd/src/TargetValue.cpp | 4 +- libompd/src/TargetValue.h | 3 +- libompd/src/omp-debug.cpp | 1148 ++++++++++++--------------- libompd/src/omp-debug.h | 1 + libompd/src/ompd-private.h | 69 ++ libompd/src/ompd.h | 153 +--- libompd/src/ompd_test.c | 2 +- 8 files changed, 600 insertions(+), 794 deletions(-) create mode 100644 libompd/src/ompd-private.h diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 4139e3968..d6ebb2984 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -327,20 +327,6 @@ const char* OMPDLevels::toString() const /* --- OMPDCallback ----------------------------------------------------------- */ -ompd_target_prim_types_t get_prim_type_from_string(const string& str) -{ - const char * names[ompd_type_max] = { - "CHAR", - "SHORT", - "INT", - "LONG", - "LONG_LONG", - "POINTER" - }; - for (int i = 0; 0 const ompd_callbacks_t *TValue::callbacks = NULL; -ompd_target_type_sizes_t TValue::type_sizes; +ompd_device_type_sizes_t TValue::type_sizes; // MARKER_MR: This is just compat stuff because I dont have time to // replace this function. TODO: replace this function @@ -27,6 +27,8 @@ inline int ompd_sizeof(ompd_target_prim_types_t t) { return TValue::type_sizes.sizeof_long_long; case ompd_type_pointer: return TValue::type_sizes.sizeof_pointer; + default: + break; } return 0; } diff --git a/libompd/src/TargetValue.h b/libompd/src/TargetValue.h index cbf8a4f9f..40b61a54e 100644 --- a/libompd/src/TargetValue.h +++ b/libompd/src/TargetValue.h @@ -1,5 +1,6 @@ #include "ompd.h" +#include "ompd-private.h" #include #ifndef SRC_TARGET_VALUE_H_ @@ -100,7 +101,7 @@ class TValue { public: static const ompd_callbacks_t *callbacks; - static ompd_target_type_sizes_t type_sizes; + static ompd_device_type_sizes_t type_sizes; TValue() : errorState(ompd_rc_error) {} /** diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 85aa45346..d772f6043 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -15,7 +15,6 @@ #include "omp-debug.h" #include "omp.h" -#include "ompd.h" // #include #include "TargetValue.h" #include @@ -25,7 +24,7 @@ #include #include -ompd_target_type_sizes_t type_sizes; +ompd_device_type_sizes_t type_sizes; uint64_t ompd_state; /* --- OMPD functions ------------------------------------------------------- */ @@ -57,21 +56,6 @@ ompd_process_initialize(ompd_address_space_context_t ompd_rc_t ret = initTypeSizes(context); if (ret != ompd_rc_ok) return ret; -#if 0 - ret = TValue(context, "ompd_rtl_version") - .castBase(ompd_type_int) - .getValue(rtl_version); - if ((ret == ompd_rc_ok && rtl_version < 5) || - ret == ompd_rc_target_read_error) - return ompd_rc_incompatible; - if (ret != ompd_rc_ok) - return ret; - ret = TValue(context, "ompd_state") - .castBase(ompd_type_long_long) - .getValue(ompd_state); - if (ret != ompd_rc_ok) - return ret; -#endif *addrhandle = new ompd_address_space_handle_t; if (!addrhandle) return ompd_rc_error; @@ -241,53 +225,93 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, return ompd_rc_stale_handle; *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; return ompd_rc_ok; - } +} -#if 0 - ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id - ) - { - pthread_t thread_id; - ompd_rc_t ret; - ret = 
ompd_get_thread_id(thread_handle, ompd_thread_id_pthread, sizeof(pthread_t), &thread_id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)thread_id); - return ompd_rc_ok; - } -#endif +/* --- 4.2 Parallel Region Handles------------------------------------------- */ - /* --- 4.2 Parallel Region Handles------------------------------------------- */ +/* parallel_handle is of type (kmp_base_team_t)*/ - /* parallel_handle is of type (kmp_base_team_t)*/ +ompd_rc_t ompd_get_current_parallel_handle( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_parallel_handle_t **parallel_handle /* OUT: OpenMP parallel handle */ + ) { + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - ompd_rc_t ompd_get_current_parallel_handle( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_parallel_handle_t **parallel_handle /* OUT: OpenMP parallel handle */ - ) { - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr, lwt; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; + TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_team") /*__kmp_threads[t]->th.th_team*/ + .cast("kmp_team_p", 1) + .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + + ompd_rc_t ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + + ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->lwt = lwt; + return ompd_rc_ok; +} + +ompd_rc_t ompd_get_enclosing_parallel_handle( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_parallel_handle_t * + *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr = parallel_handle->th, lwt; + + ompd_rc_t ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, parallel_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 - TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_team") /*__kmp_threads[t]->th.th_team*/ - 
.cast("kmp_team_p", 1) - .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + TValue teamdata = + TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_parent") /*t.t_parent*/ + .cast("kmp_team_p", 1) + .access("t"); /*t.t_parent->t*/ - ompd_rc_t ret = teamdata.getAddress(&taddr); + ret = teamdata.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; @@ -298,188 +322,169 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, .getValue(lwt.address); if (ret != ompd_rc_ok) return ret; + } - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(enclosing_parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + (*enclosing_parallel_handle)->th = taddr; + (*enclosing_parallel_handle)->lwt = lwt; + (*enclosing_parallel_handle)->ah = parallel_handle->ah; + return ompd_rc_ok; +} - (*parallel_handle)->ah = thread_handle->ah; - (*parallel_handle)->th = taddr; - (*parallel_handle)->lwt = lwt; - return ompd_rc_ok; - } +ompd_rc_t ompd_get_task_parallel_handle( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_parallel_handle_t * + *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; - ompd_rc_t ompd_get_enclosing_parallel_handle( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_parallel_handle_t * - *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - if (!context) - return ompd_rc_stale_handle; + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = parallel_handle->th, lwt; - - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, parallel_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + ompd_rc_t ret; + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .getAddress(&taddr); - TValue teamdata = - TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_parent") /*t.t_parent*/ - .cast("kmp_team_p", 1) - .access("t"); /*t.t_parent->t*/ - - ret = teamdata.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; - } + if (ret != ompd_rc_ok) + return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(enclosing_parallel_handle)); - if (ret != ompd_rc_ok) - return ret; - (*enclosing_parallel_handle)->th = taddr; - 
(*enclosing_parallel_handle)->lwt = lwt; - (*enclosing_parallel_handle)->ah = parallel_handle->ah; - return ompd_rc_ok; - } + ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(enclosing_parallel_handle)); + if (ret != ompd_rc_ok) + return ret; - ompd_rc_t ompd_get_task_parallel_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_parallel_handle_t * - *enclosing_parallel_handle /* OUT: OpenMP parallel handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; + (*enclosing_parallel_handle)->ah = task_handle->ah; + (*enclosing_parallel_handle)->lwt = task_handle->lwt; + (*enclosing_parallel_handle)->th = taddr; + return ompd_rc_ok; +} - if (!context) - return ompd_rc_stale_handle; +ompd_rc_t ompd_release_parallel_handle( + ompd_parallel_handle_t *parallel_handle /* IN: OpenMP parallel handle */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + return ompd_rc_ok; +} - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; +ompd_rc_t +ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value) { + if (!parallel_handle_1) + return ompd_rc_stale_handle; + if (!parallel_handle_2) + return ompd_rc_stale_handle; + if (parallel_handle_1->th.address - parallel_handle_2->th.address) + *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; + else + *cmp_value = + parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + return ompd_rc_ok; +} - ompd_rc_t ret; - ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .getAddress(&taddr); +/* --- 4.3 Task Handles ----------------------------------------------------- */ - if (ret != ompd_rc_ok) - return ret; +/* task_handle is of type (kmp_taskdata_t) */ - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(enclosing_parallel_handle)); - if (ret != ompd_rc_ok) - return ret; +ompd_rc_t ompd_get_current_task_handle( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ + ) { + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - (*enclosing_parallel_handle)->ah = task_handle->ah; - (*enclosing_parallel_handle)->lwt = task_handle->lwt; - (*enclosing_parallel_handle)->th = taddr; - return ompd_rc_ok; - } + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr, lwt; - ompd_rc_t ompd_release_parallel_handle( - ompd_parallel_handle_t *parallel_handle /* IN: OpenMP parallel handle */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; - return ompd_rc_ok; - } + TValue taskdata = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ + .cast("kmp_taskdata_t", 1); - ompd_rc_t - 
ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, - ompd_parallel_handle_t *parallel_handle_2, - int *cmp_value) { - if (!parallel_handle_1) - return ompd_rc_stale_handle; - if (!parallel_handle_2) - return ompd_rc_stale_handle; - if (parallel_handle_1->th.address - parallel_handle_2->th.address) - *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; - else - *cmp_value = - parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; - return ompd_rc_ok; - } + ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; -#if 0 // parallel-id is initialized to zero - ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id - ) - { - ompd_parallel_id_t id; - ompd_rc_t ret; - ret = ompd_get_parallel_id(parallel_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; - } -#endif + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = taskdata + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; - /* --- 4.3 Task Handles ----------------------------------------------------- */ + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(task_handle)); + if (ret != ompd_rc_ok) + return ret; - /* task_handle is of type (kmp_taskdata_t) */ + (*task_handle)->th = taddr; + (*task_handle)->lwt = lwt; + (*task_handle)->ah = thread_handle->ah; + return ompd_rc_ok; +} - ompd_rc_t ompd_get_current_task_handle( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ - ) { - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; +ompd_rc_t ompd_get_generating_task_handle( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr = task_handle->th, lwt; + + ompd_rc_t ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, task_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 - TValue taskdata = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ - .cast("kmp_taskdata_t", 1); + TValue taskdata = TValue(context, task_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_taskdata_t") /*td*/ + .access("td_parent") /*td->td_parent*/ + 
.cast("kmp_taskdata_t", 1); - ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); + ret = taskdata.dereference().getAddress(&taddr); if (ret != ompd_rc_ok) return ret; @@ -494,485 +499,374 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, .getValue(lwt.address); if (ret != ompd_rc_ok) return ret; + } - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(task_handle)); - if (ret != ompd_rc_ok) - return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(parent_task_handle)); + if (ret != ompd_rc_ok) + return ret; - (*task_handle)->th = taddr; - (*task_handle)->lwt = lwt; - (*task_handle)->ah = thread_handle->ah; - return ompd_rc_ok; - } + (*parent_task_handle)->th = taddr; + (*parent_task_handle)->lwt = lwt; + (*parent_task_handle)->ah = task_handle->ah; + return ret; +} - ompd_rc_t ompd_get_generating_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; +ompd_rc_t ompd_get_scheduling_task_handle( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ + ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = task_handle->th, lwt; - - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, task_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 - - TValue taskdata = TValue(context, task_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_taskdata_t") /*td*/ - .access("td_parent") /*td->td_parent*/ - .cast("kmp_taskdata_t", 1); - - ret = taskdata.dereference().getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = taskdata - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; - } + assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(parent_task_handle)); - if (ret != ompd_rc_ok) - return ret; + ompd_rc_t ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("ompt_task_info") // td->ompt_task_info + .cast("ompt_task_info_t") + .access("scheduling_parent") // td->ompd_task_info.scheduling_parent + .cast("kmp_taskdata_t", 1) + .dereference() + .getAddress(&taddr); - (*parent_task_handle)->th = taddr; - (*parent_task_handle)->lwt = lwt; - (*parent_task_handle)->ah = task_handle->ah; + if (ret != ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(parent_task_handle)); + if (ret != ompd_rc_ok) return 
ret; - } - ompd_rc_t ompd_get_scheduling_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + (*parent_task_handle)->th = taddr; + (*parent_task_handle)->ah = task_handle->ah; + return ret; +} - assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; +ompd_rc_t ompd_get_task_in_parallel( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + int nth_handle, /* OUT: number of the task handle */ + ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ + ) { + int i; + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("scheduling_parent") // td->ompd_task_info.scheduling_parent + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret; + ompd_address_t taddr; + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ .cast("kmp_taskdata_t", 1) - .dereference() + .getArrayElement( + nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ .getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(parent_task_handle)); - if (ret != ompd_rc_ok) - return ret; - - (*parent_task_handle)->th = taddr; - (*parent_task_handle)->ah = task_handle->ah; + if (ret != ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + (void **)(task_handle)); + if (ret != ompd_rc_ok) return ret; - } - - ompd_rc_t ompd_get_task_in_parallel( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - int nth_handle, /* OUT: number of the task handle */ - ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ - ) { - int i; - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret; - ompd_address_t taddr; - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ - .cast("kmp_taskdata_t", 1) - .getArrayElement( - nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ - .getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), - (void **)(task_handle)); - if (ret != ompd_rc_ok) - return ret; + (*task_handle)->th = taddr; + (*task_handle)->ah = parallel_handle->ah; + return ret; +} - (*task_handle)->th = taddr; - (*task_handle)->ah = parallel_handle->ah; +ompd_rc_t ompd_release_task_handle( + ompd_task_handle_t *task_handle /* IN: OpenMP task handle */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + ompd_rc_t ret = callbacks->dmemory_free((void 
*)(task_handle)); + if (ret != ompd_rc_ok) return ret; - } + return ompd_rc_ok; +} - ompd_rc_t ompd_release_task_handle( - ompd_task_handle_t *task_handle /* IN: OpenMP task handle */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(task_handle)); - if (ret != ompd_rc_ok) - return ret; - return ompd_rc_ok; - } +ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value) { + if (!task_handle_1) + return ompd_rc_stale_handle; + if (!task_handle_2) + return ompd_rc_stale_handle; + if (task_handle_1->th.address - task_handle_2->th.address) + *cmp_value = task_handle_1->th.address - task_handle_2->th.address; + else + *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; + return ompd_rc_ok; +} - ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, - ompd_task_handle_t *task_handle_2, - int *cmp_value) { - if (!task_handle_1) - return ompd_rc_stale_handle; - if (!task_handle_2) - return ompd_rc_stale_handle; - if (task_handle_1->th.address - task_handle_2->th.address) - *cmp_value = task_handle_1->th.address - task_handle_2->th.address; - else - *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; - return ompd_rc_ok; - } -#if 0 // all task ids are initialized to zero - ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id - ) - { - ompd_task_id_t id; - ompd_rc_t ret = ompd_get_task_id(task_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; - } -#endif +/* --- 5 Process and Thread Settings ---------------------------------------- */ - /* --- 5 Process and Thread Settings ---------------------------------------- */ +ompd_rc_t +ompd_get_num_procs(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: number of processes */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; - ompd_rc_t - ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; + if (!context) + return ompd_rc_stale_handle; - if (!context) - return ompd_rc_stale_handle; + assert(callbacks && "Callback table not initialized!"); - assert(callbacks && "Callback table not initialized!"); + int nth; + ret = TValue(context, "__kmp_avail_proc") + .castBase("__kmp_avail_proc") + .getValue(nth); + *val = nth; + return ret; +} - int nth; - ret = TValue(context, "__kmp_avail_proc") - .castBase("__kmp_avail_proc") - .getValue(nth); - *val = nth; - return ret; - } +ompd_rc_t +ompd_get_thread_limit(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; - ompd_rc_t - ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; 
- ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; + if (!context) + return ompd_rc_stale_handle; - if (!context) - return ompd_rc_stale_handle; + assert(callbacks && "Callback table not initialized!"); - assert(callbacks && "Callback table not initialized!"); + int nth; + ret = + TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); + *val = nth; + return ret; +} - int nth; - ret = - TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); - *val = nth; - return ret; - } +/* --- 6 Parallel Region Inqueries ------------------------------------------ */ +/* --- 6.1 Settings --------------------------------------------------------- */ - /* --- 6 Parallel Region Inqueries ------------------------------------------ */ - /* --- 6.1 Settings --------------------------------------------------------- */ +ompd_rc_t ompd_get_num_threads( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: number of threads */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + assert(callbacks && "Callback table not initialized!"); - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = ompd_rc_ok; - if (parallel_handle->lwt.address != 0) - *val = 1; - else { - uint32_t res; - ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_nproc") /*t.t_nproc*/ - .castBase() - .getValue(res); - *val = res; - } - return ret; + ompd_rc_t ret = ompd_rc_ok; + if (parallel_handle->lwt.address != 0) + *val = 1; + else { + uint32_t res; + ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_nproc") /*t.t_nproc*/ + .castBase() + .getValue(res); + *val = res; } + return ret; +} - ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; +ompd_rc_t ompd_get_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: nesting level */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); + assert(callbacks && "Callback table not initialized!"); - uint32_t res; + uint32_t res; - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_level") /*t.t_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; - } + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 
0) /*t*/ + .access("t_level") /*t.t_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} - ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; +ompd_rc_t ompd_get_active_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: active nesting level */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; - assert(callbacks && "Callback table not initialized!"); + assert(callbacks && "Callback table not initialized!"); - uint32_t res; + uint32_t res; - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_active_level") /*t.t_active_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; - } + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_active_level") /*t.t_active_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} - /* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ +/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - ompd_rc_t ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; +ompd_rc_t ompd_get_parallel_data( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_address_t *data /* OUT: OpenMP parallel id */ + ) { + if (!parallel_handle) + return ompd_rc_stale_handle; + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; #if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; #endif - assert(callbacks && "Callback table not initialized!"); - - TValue teamInfo; - if (parallel_handle->lwt.address != 0) - teamInfo = TValue(context, parallel_handle->lwt) - .cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - teamInfo = - TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ - ompd_rc_t ret = teamInfo - .access("ompt_team_info") /*t.ompt_team_info*/ - .cast("ompt_team_info_t", 0) - .access("parallel_data") /*t.ompt_team_info.parallel_id*/ - .getAddress(data); - return ret; - } - -#if 0 // there is no such thing as a parallel function - ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ) - { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return 
ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - parallel_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - - TValue teamInfo; - if(parallel_handle->lwt.address!=0) - teamInfo = TValue(context, parallel_handle->lwt). - cast("ompt_lw_taskteam_t",0); /*lwt*/ - else - teamInfo = TValue(context, parallel_handle->th). - cast("kmp_base_team_t",0); /*t*/ - ompd_rc_t ret = teamInfo. - access("ompt_team_info"). /*t.ompt_team_info*/ - cast("ompt_team_info_t",0). - access("microtask"). /*t.ompt_team_info.microtask*/ - castBase(). - getValue(parallel_addr->address); - return ret; - } -#endif // no parallel function + assert(callbacks && "Callback table not initialized!"); - /* --- 7 Thread Inquiry ----------------------------------------------------- */ + TValue teamInfo; + if (parallel_handle->lwt.address != 0) + teamInfo = TValue(context, parallel_handle->lwt) + .cast("ompt_lw_taskteam_t", 0); /*lwt*/ + else + teamInfo = + TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ + ompd_rc_t ret = teamInfo + .access("ompt_team_info") /*t.ompt_team_info*/ + .cast("ompt_team_info_t", 0) + .access("parallel_data") /*t.ompt_team_info.parallel_id*/ + .getAddress(data); + return ret; +} - /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - ompd_rc_t - ompd_get_thread_handle(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; +/* --- 7 Thread Inquiry ----------------------------------------------------- */ - if (!context) - return ompd_rc_stale_handle; +/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - assert(callbacks && "Callback table not initialized!"); - ompd_thread_context_t *tcontext; - ret = callbacks->get_thread_context_for_thread_id( - context, kind, sizeof_thread_id, thread_id, &tcontext); - if (ret != ompd_rc_ok) - return ret; +ompd_rc_t +ompd_get_thread_handle(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_kind_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; - int tId; + if (!context) + return ompd_rc_stale_handle; - if (kind == ompd_thread_id_cudalogical) { - ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; + assert(callbacks && "Callback table not initialized!"); + ompd_thread_context_t *tcontext; + ret = callbacks->get_thread_context_for_thread_id( + context, kind, sizeof_thread_id, thread_id, &tcontext); + if (ret != ompd_rc_ok) + return ret; - // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->items.threadId - TValue th = TValue(context, tcontext, - "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) - .cast("omptarget_nvptx_ThreadPrivateContext", 1, - OMPD_SEGMENT_CUDA_PTX_SHARED) - .access("topTaskDescr") - .cast("omptarget_nvptx_TaskDescr", 1, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .getArrayElement(p->threadIdx.x); + int tId; - ompd_address_t taddr; - ret = th.getAddress(&taddr); + if (kind == ompd_thread_id_cudalogical) { + ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - if (ret 
!= ompd_rc_ok) - return ret; + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->items.threadId + TValue th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(p->threadIdx.x); - ret = th.access("items__threadId") - .castBase(ompd_type_short) - .getValue(tId); + ompd_address_t taddr; + ret = th.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - if (tId != p->threadIdx.x) - return ompd_rc_stale_handle; + ret = th.access("items__threadId") + .castBase(ompd_type_short) + .getValue(tId); - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), - (void **)(thread_handle)); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) + return ret; - (*thread_handle)->ah = addr_handle; - (*thread_handle)->th = taddr; - } else { - ret = TValue(context, tcontext, "__kmp_gtid") - .castBase("__kmp_gtid") - .getValue(tId); - if (ret != ompd_rc_ok) - return ret; + if (tId != p->threadIdx.x) + return ompd_rc_stale_handle; - if (tId < 0) // thread is no omp worker - return ompd_rc_unavailable; + ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; - TValue th = TValue(context, "__kmp_threads") // __kmp_threads - .cast("kmp_info_t", 2) - .getArrayElement(tId) /*__kmp_threads[t]*/ - .access("th"); /*__kmp_threads[t]->th*/ + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; +} else { + ret = TValue(context, tcontext, "__kmp_gtid") + .castBase("__kmp_gtid") + .getValue(tId); + if (ret != ompd_rc_ok) + return ret; - ompd_address_t taddr; - ret = th.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), - (void **)(thread_handle)); - if (ret != ompd_rc_ok) - return ret; - (*thread_handle)->ah = addr_handle; - (*thread_handle)->th = taddr; + if (tId < 0) // thread is no omp worker + return ompd_rc_unavailable; + + TValue th = TValue(context, "__kmp_threads") // __kmp_threads + .cast("kmp_info_t", 2) + .getArrayElement(tId) /*__kmp_threads[t]*/ + .access("th"); /*__kmp_threads[t]->th*/ + + ompd_address_t taddr; + ret = th.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; #ifndef NDEBUG if (ret != ompd_rc_ok) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index a4cd8f785..b4fdff0b1 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -32,6 +32,7 @@ extern "C" { #define STR(x) STR_HELPER(x) #include "ompd.h" +#include "ompd-private.h" /****************************************************************************** * General helper functions diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h new file mode 100644 index 000000000..b54275d91 --- /dev/null +++ b/libompd/src/ompd-private.h @@ -0,0 +1,69 @@ +#ifndef SRC_OMPD_PRIVATE_H_ +#define SRC_OMPD_PRIVATE_H_ + + +/* + * Definition of OMPD states, taken from OMPT + */ +#define FOREACH_OMP_STATE(macro) \ + \ + /* first available state */ \ + macro (omp_state_undefined, 0x102) /* undefined thread state */ \ + \ + /* work states (0..15) */ \ + macro (omp_state_work_serial, 
0x000) /* working outside parallel */ \ + macro (omp_state_work_parallel, 0x001) /* working within parallel */ \ + macro (omp_state_work_reduction, 0x002) /* performing a reduction */ \ + \ + /* barrier wait states (16..31) */ \ + macro (omp_state_wait_barrier, 0x010) /* waiting at a barrier */ \ + macro (omp_state_wait_barrier_implicit_parallel, 0x011) \ + /* implicit barrier at the end of parallel region */\ + macro (omp_state_wait_barrier_implicit_workshare, 0x012) \ + /* implicit barrier at the end of worksharing */ \ + macro (omp_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \ + macro (omp_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \ + \ + /* task wait states (32..63) */ \ + macro (omp_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \ + macro (omp_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \ + \ + /* mutex wait states (64..127) */ \ + macro (omp_state_wait_mutex, 0x040) \ + macro (omp_state_wait_lock, 0x041) /* waiting for lock */ \ + macro (omp_state_wait_critical, 0x042) /* waiting for critical */ \ + macro (omp_state_wait_atomic, 0x043) /* waiting for atomic */ \ + macro (omp_state_wait_ordered, 0x044) /* waiting for ordered */ \ + \ + /* target wait states (128..255) */ \ + macro (omp_state_wait_target, 0x080) /* waiting for target region */ \ + macro (omp_state_wait_target_map, 0x081) /* waiting for target data mapping operation */ \ + macro (omp_state_wait_target_update, 0x082) /* waiting for target update operation */ \ + \ + /* misc (256..511) */ \ + macro (omp_state_idle, 0x100) /* waiting for work */ \ + macro (omp_state_overhead, 0x101) /* overhead excluding wait states */ \ + \ + /* implementation-specific states (512..) */ + +typedef enum omp_state_t { +#define ompd_state_macro(state, code) state = code, + FOREACH_OMP_STATE(ompd_state_macro) +#undef ompd_state_macro +} omp_state_t; + +/** + * Primitive types. 
+ */ +typedef enum ompd_target_prim_types_t { + ompd_type_invalid = -1, + ompd_type_char = 0, + ompd_type_short = 1, + ompd_type_int = 2, + ompd_type_long = 3, + ompd_type_long_long = 4, + ompd_type_pointer = 5, + ompd_type_max +} ompd_target_prim_types_t; + +#endif /*SRC_OMPD_PRIVATE_H*/ diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 0fdd1c2d9..8adb54fd0 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -21,10 +21,6 @@ * - Type entities end with the suffix "_t" (for type) * - Function types end with the suffix "_fn_t" (for function type) * - Return code entities have "_rc_" in it - * - Abstractions referring to the target have the prefix "t" (e.g., - * "tmemory" for memory in the target, or "tsymbol" for symbol in the target) - * - Abstractions referring to the debugger have the prefix "d" (e.g., - * "dmemory" for memory in the debugger) * * Comment conventions: * - Input function parameters denoted by "IN:" @@ -85,71 +81,6 @@ typedef struct ompd_address_t { #define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) #define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) -//#if 0 // types removed in Austin F2F -/* - * Definition of OMPD states, taken from OMPT - */ -#define FOREACH_OMPD_STATE(macro) \ - \ - /* first */ \ - macro(ompd_state_first, 0x71) /* initial enumeration state */ \ - \ - /* work states (0..15) */ \ - macro(ompd_state_work_serial, 0x00) /* working outside parallel */ \ - macro(ompd_state_work_parallel, 0x01) /* working within parallel */ \ - macro(ompd_state_work_reduction, 0x02) /* performing a reduction */ \ - \ - /* idle (16..31) */ \ - macro(ompd_state_idle, 0x10) /* waiting for work */ \ - \ - /* overhead states (32..63) */ \ - macro(ompd_state_overhead, 0x20) /* overhead excluding wait states */ \ - \ - /* barrier wait states (64..79) */ \ - macro(ompd_state_wait_barrier, 0x40) /* waiting at a barrier */ \ - macro(ompd_state_wait_barrier_implicit, 0x41) /* implicit barrier */ \ - macro(ompd_state_wait_barrier_explicit, 0x42) /* explicit barrier */ \ - \ - /* task wait states (80..95) */ \ - macro(ompd_state_wait_taskwait, 0x50) /* waiting at a taskwait */ \ - macro(ompd_state_wait_taskgroup, 0x51) /* waiting at a taskgroup */ \ - \ - /* mutex wait states (96..111) */ \ - macro(ompd_state_wait_lock, 0x60) /* waiting for lock */ \ - macro(ompd_state_wait_nest_lock, 0x61) /* waiting for nest lock */ \ - macro(ompd_state_wait_critical, 0x62) /* waiting for critical */ \ - macro(ompd_state_wait_atomic, 0x63) /* waiting for atomic */ \ - macro(ompd_state_wait_ordered, 0x64) /* waiting for ordered */ \ - macro(ompd_state_wait_single, \ - 0x6F) /* waiting for single region (non-standard!) */ \ - \ - /* misc (112..127) */ \ - macro(ompd_state_undefined, 0x70) /* undefined thread state */ - -typedef enum ompd_state_t { -#define ompd_state_macro(state, code) state = code, - FOREACH_OMPD_STATE(ompd_state_macro) -#undef ompd_state_macro -} ompd_state_t; - -#if 0 -typedef enum ompd_sched_t { - ompd_sched_static = 1, - ompd_sched_dynamic = 2, - ompd_sched_guided = 3, - ompd_sched_auto = 4, - ompd_sched_vendor_lo = 5, - ompd_sched_vendor_hi = 0x7fffffff -} ompd_sched_t; - -typedef enum ompd_proc_bind_t { - ompd_proc_bind_false = 0, - ompd_proc_bind_true = 1, - ompd_proc_bind_master = 2, - ompd_proc_bind_close = 3, - ompd_proc_bind_spread = 4 -} ompd_proc_bind_t; -#endif typedef uint64_t ompd_device_identifier_t; @@ -241,26 +172,12 @@ typedef enum ompd_rc_t { ompd_rc_nomem = 10 /* unable to allocate memory */ } ompd_rc_t; -/** - * Primitive types. 
- */ -typedef enum ompd_target_prim_types_t { - ompd_type_invalid = -1, - ompd_type_char = 0, - ompd_type_short = 1, - ompd_type_int = 2, - ompd_type_long = 3, - ompd_type_long_long = 4, - ompd_type_pointer = 5, - ompd_type_max -} ompd_target_prim_types_t; - /** * Primitive type sizes. * These types are used by OMPD to interrogate the debugger about the size of * primitive types in the target. */ -typedef struct ompd_target_type_sizes_t { +typedef struct ompd_device_type_sizes_t { uint8_t sizeof_char; uint8_t sizeof_short; uint8_t sizeof_int; @@ -301,25 +218,13 @@ typedef ompd_rc_t (*ompd_get_thread_context_for_thread_id_fn_t)( ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_context_t **thread_context); -#if 0 -/** - * Get containing (host) process context for address_space_context - */ -typedef ompd_rc_t (*ompd_get_process_context_for_context_fn_t) ( - ompd_address_space_context_t* - address_space_context, /* IN: OMP device/process addr space */ - ompd_address_space_context_t** - containing_address_space_context /* OUT: Containing omp process addr space */ -); -#endif - /** * Look up the sizes of primitive types in the target */ typedef ompd_rc_t (*ompd_tsizeof_prim_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ - ompd_target_type_sizes_t *sizes /* OUT: type sizes */ + ompd_device_type_sizes_t *sizes /* OUT: type sizes */ ); /** @@ -462,6 +367,7 @@ ompd_rc_t ompd_device_initialize( ); ompd_rc_t ompd_finalize(void); + /* --- 4 Handle Management -------------------------------------------------- */ /* --- 4.1 Thread Handles --------------------------------------------------- */ @@ -497,25 +403,12 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_thread_handle_t **thread_handle /* OUT: handle */ ); -#if 0 -ompd_rc_t ompd_get_master_thread_in_parallel ( - ompd_parallel_handle_t *parallel_handle, /* IN */ - ompd_thread_handle_t **thread_handle); -#endif - ompd_rc_t ompd_release_thread_handle(ompd_thread_handle_t *thread_handle); ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, ompd_thread_handle_t *thread_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id -); -#endif - /* --- 4.2 Parallel Region Handles------------------------------------------- */ /** @@ -569,13 +462,6 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, ompd_parallel_handle_t *parallel_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id -); -#endif - /* --- 4.3 Task Handles ----------------------------------------------------- */ /** @@ -599,12 +485,6 @@ ompd_rc_t ompd_get_current_task_handle( * meaningful only if the thread executing the task specified by task_handle is * stopped. 
*/ -#if 0 -ompd_rc_t ompd_get_ancestor_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ); -#endif ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ @@ -636,13 +516,6 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_task_handle_t *task_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id -); -#endif - /* --- 5o Process and Thread Settings ---------------------------------------- */ @@ -707,13 +580,6 @@ ompd_rc_t ompd_get_parallel_data( ompd_address_t *data /* OUT: OpenMP parallel id */ ); -#if 0 -ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ); -#endif - /* --- 7 Thread Inquiry ----------------------------------------------------- */ /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ @@ -771,19 +637,6 @@ ompd_rc_t ompd_get_state( /* --- 8 Task Inquiry ------------------------------------------------------- */ -/* --- 8.1 Task Function Entry Point ---------------------------------------- */ - -/** - * The ompd_get_task_function returns the entry point of the code that - * corresponds to the body of code executed by the task. - */ - -#if 0 -ompd_rc_t ompd_get_task_function( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *entry_point /* OUT: first instruction in the task region */ - ); -#endif /* --- 8.2 Task Settings ---------------------------------------------------- */ diff --git a/libompd/src/ompd_test.c b/libompd/src/ompd_test.c index 92609a66a..f54385730 100644 --- a/libompd/src/ompd_test.c +++ b/libompd/src/ompd_test.c @@ -59,7 +59,7 @@ void test_CB_tsizeof_prim() { test_print_header(); ompd_rc_t ret; - ompd_target_type_sizes_t sizes; + ompd_device_type_sizes_t sizes; ret = callbacks->tsizeof_prim((ompd_context_t *)1, &sizes); if (ret == ompd_rc_ok) { printf("%-20s %du\n", "Size of char:", sizes.sizeof_char); From f9cc9b40cbb939f53cbea71fe5a9b10ffd9f371a Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 3 Jul 2018 09:19:26 -0700 Subject: [PATCH 07/64] [OMPD] Aligned callback names with spec --- libompd/gdb-wrapper/Callbacks.cpp | 20 ++++----- libompd/gdb-wrapper/Callbacks.h | 2 +- libompd/gdb-wrapper/CudaGdb.h | 1 + libompd/gdb-wrapper/OMPDCommand.cpp | 4 +- libompd/src/TargetValue.cpp | 63 +++++++++++++++-------------- libompd/src/omp-debug.cpp | 30 +++++++------- libompd/src/omp-debug.h | 8 ++-- libompd/src/ompd-private.h | 21 ++++++++++ libompd/src/ompd.h | 62 +++++++++------------------- 9 files changed, 105 insertions(+), 106 deletions(-) diff --git a/libompd/gdb-wrapper/Callbacks.cpp b/libompd/gdb-wrapper/Callbacks.cpp index d579bf7e1..ad739c3e5 100644 --- a/libompd/gdb-wrapper/Callbacks.cpp +++ b/libompd/gdb-wrapper/Callbacks.cpp @@ -37,16 +37,16 @@ void initializeCallbacks(const GdbProcessPtr &proc) gdb = proc; // Initialize static table - cb.dmemory_alloc = CB_dmemory_alloc; - cb.dmemory_free = CB_dmemory_free; - cb.print_string = CB_print_string; + cb.memory_alloc = CB_dmemory_alloc; + cb.memory_free = CB_dmemory_free; + cb.print_string = CB_print_string; cb.get_thread_context_for_thread_id = CB_thread_context; - cb.tsizeof_prim = CB_tsizeof_prim; - cb.tsymbol_addr = CB_tsymbol_addr; - 
cb.read_tmemory = CB_read_tmemory; - cb.write_tmemory = CB_write_tmemory; - cb.host_to_target = CB_host_to_target; - cb.target_to_host = CB_target_to_host; + cb.sizeof_types = CB_tsizeof_prim; + cb.symbol_addr_lookup = CB_tsymbol_addr; + cb.read_memory = CB_read_tmemory; + cb.write_memory = CB_write_tmemory; + cb.host_to_device = CB_host_to_target; + cb.device_to_host = CB_target_to_host; } ompd_callbacks_t * getCallbacksTable() @@ -125,7 +125,7 @@ void init_sizes(){ ompd_rc_t CB_tsizeof_prim( ompd_address_space_context_t *context, - ompd_target_type_sizes_t *sizes) + ompd_device_type_sizes_t *sizes) { ompd_rc_t ret = context ? ompd_rc_ok : ompd_rc_stale_handle; static int inited = 0; diff --git a/libompd/gdb-wrapper/Callbacks.h b/libompd/gdb-wrapper/Callbacks.h index 349e30f11..040c7819e 100644 --- a/libompd/gdb-wrapper/Callbacks.h +++ b/libompd/gdb-wrapper/Callbacks.h @@ -59,7 +59,7 @@ ompd_rc_t CB_process_context ( ompd_rc_t CB_tsizeof_prim ( ompd_address_space_context_t *context, - ompd_target_type_sizes_t *sizes); + ompd_device_type_sizes_t *sizes); ompd_rc_t CB_tsymbol_addr ( ompd_address_space_context_t *context, diff --git a/libompd/gdb-wrapper/CudaGdb.h b/libompd/gdb-wrapper/CudaGdb.h index b690257b6..0408668b0 100644 --- a/libompd/gdb-wrapper/CudaGdb.h +++ b/libompd/gdb-wrapper/CudaGdb.h @@ -13,6 +13,7 @@ #include #include #include "ompd.h" +#include "../src/ompd-private.h" struct CudaThread { ompd_cudathread_coord_t coord; diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index d6ebb2984..e1582716b 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -25,7 +25,7 @@ using namespace ompd_gdb; using namespace std; -const char * ompd_state_names[256]; +const char * ompd_state_names[512]; extern OMPDHostContextPool * host_contextPool; /* --- OMPDCommandFactory --------------------------------------------------- */ @@ -35,7 +35,7 @@ OMPDCommandFactory::OMPDCommandFactory() functions = OMPDFunctionsPtr(new OMPDFunctions); #define ompd_state_macro(state, code) ompd_state_names[code] = #state; - FOREACH_OMPD_STATE(ompd_state_macro) + FOREACH_OMP_STATE(ompd_state_macro) #undef ompd_state_macro // Load OMPD DLL and get a handle diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index e4cb522b1..e81efe918 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -65,7 +65,7 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { ompd_size_t tmpSize; std::stringstream ss; ss << "ompd_sizeof__" << typeName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -75,13 +75,13 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpSize)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host( - context, &tmpSize, ompd_sizeof(ompd_type_long_long), 1, &(typeSize)); + ret = TValue::callbacks->device_to_host( + context, &tmpSize, TValue::type_sizes.sizeof_long_long, 1, &(typeSize)); } *size = typeSize; return ret; @@ -98,7 +98,7 @@ ompd_rc_t TType::getBitfieldMask(const char *fieldName, // &fieldOffset); std::stringstream ss; ss << 
"ompd_bitfield__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -108,14 +108,14 @@ ompd_rc_t TType::getBitfieldMask(const char *fieldName, } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpMask)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host(context, &(tmpMask), - ompd_sizeof(ompd_type_long_long), 1, - &(bitfieldMask)); + ret = TValue::callbacks->device_to_host(context, &(tmpMask), + TValue::type_sizes.sizeof_long_long, + 1, &(bitfieldMask)); if (ret != ompd_rc_ok) { return ret; } @@ -135,7 +135,7 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { // &fieldOffset); std::stringstream ss; ss << "ompd_access__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -145,14 +145,14 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host(context, &(tmpOffset), - ompd_sizeof(ompd_type_long_long), 1, - &fieldOffset); + ret = TValue::callbacks->device_to_host(context, &(tmpOffset), + TValue::type_sizes.sizeof_long_long, + 1, &fieldOffset); if (ret != ompd_rc_ok) { return ret; } @@ -172,7 +172,7 @@ ompd_rc_t TType::getElementSize(const char *fieldName, ompd_size_t *size) { // &fieldOffset); std::stringstream ss; ss << "ompd_sizeof__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -182,13 +182,14 @@ ompd_rc_t TType::getElementSize(const char *fieldName, ompd_size_t *size) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host( - context, &tmpOffset, ompd_sizeof(ompd_type_long_long), 1, &fieldSize); + ret = TValue::callbacks->device_to_host(context, &tmpOffset, + TValue::type_sizes.sizeof_long_long, + 1, &fieldSize); if (ret != ompd_rc_ok) { return ret; } @@ -230,7 +231,7 @@ TValue::TValue(ompd_address_space_context_t *_context, /*valueName(_valueName),*/ context(_context), tcontext(_tcontext), fieldSize(0) { errorState.errorCode = - callbacks->tsymbol_addr(context, tcontext, _valueName, &symbolAddr); + callbacks->symbol_addr_lookup(context, tcontext, _valueName, &symbolAddr); symbolAddr.segment = segment; // assert((ret==ompd_rc_ok) && "Callback 
call failed"); } @@ -278,14 +279,14 @@ TValue TValue::dereference() const { assert(pointerLevel > 0 && "cannot dereference non-pointer"); TValue ret = *this; ret.pointerLevel--; - ret.errorState.errorCode = callbacks->read_tmemory( - context, tcontext, symbolAddr, 1 * ompd_sizeof(ompd_type_pointer), + ret.errorState.errorCode = callbacks->read_memory( + context, tcontext, symbolAddr, 1 * TValue::type_sizes.sizeof_pointer, &(tmpAddr.address)); if (ret.errorState.errorCode != ompd_rc_ok) return ret; - ret.errorState.errorCode = callbacks->target_to_host( - context, &(tmpAddr.address), ompd_sizeof(ompd_type_pointer), 1, + ret.errorState.errorCode = callbacks->device_to_host( + context, &(tmpAddr.address), TValue::type_sizes.sizeof_pointer, 1, &(ret.symbolAddr.address)); if (ret.errorState.errorCode != ompd_rc_ok) { return ret; @@ -311,7 +312,7 @@ ompd_rc_t TValue::getRawValue(void *buf, int count) { return errorState.errorCode; errorState.errorCode = - callbacks->read_tmemory(context, tcontext, symbolAddr, size, buf); + callbacks->read_memory(context, tcontext, symbolAddr, size, buf); return errorState.errorCode; } @@ -386,12 +387,12 @@ TBaseValue::TBaseValue(const TValue &_tvalue, ompd_size_t _baseTypeSize) ompd_rc_t TBaseValue::getValue(void *buf, int count) { if (errorState.errorCode != ompd_rc_ok) return errorState.errorCode; - errorState.errorCode = callbacks->read_tmemory(context, tcontext, symbolAddr, + errorState.errorCode = callbacks->read_memory(context, tcontext, symbolAddr, count * baseTypeSize, buf); if (errorState.errorCode != ompd_rc_ok) return errorState.errorCode; errorState.errorCode = - callbacks->target_to_host(context, buf, baseTypeSize, count, buf); + callbacks->device_to_host(context, buf, baseTypeSize, count, buf); return errorState.errorCode; } @@ -399,7 +400,7 @@ ompd_rc_t TBaseValue::getValue(void *buf, int count) { // { // if( errorState.errorCode != ompd_rc_ok ) // return errorState.errorCode; -// errorState.errorCode = callbacks->read_tmemory(context, tcontext, +// errorState.errorCode = callbacks->read_memory(context, tcontext, // symbolAddr, // count, baseType, &(buf->th)); // assert((errorState.errorCode == ompd_rc_ok) && "Callback call failed"); diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index d772f6043..5f773b871 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -195,7 +195,7 @@ ompd_rc_t ompd_get_thread_in_parallel( if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; @@ -210,7 +210,7 @@ ompd_rc_t ompd_release_thread_handle( ) { if (!thread_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(thread_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(thread_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -264,7 +264,7 @@ ompd_rc_t ompd_get_current_parallel_handle( if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(parallel_handle)); if (ret != ompd_rc_ok) return ret; @@ -324,7 +324,7 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( return ret; } - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(enclosing_parallel_handle)); if (ret != ompd_rc_ok) return ret; @@ -362,7 +362,7 
@@ ompd_rc_t ompd_get_task_parallel_handle( if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(enclosing_parallel_handle)); if (ret != ompd_rc_ok) return ret; @@ -378,7 +378,7 @@ ompd_rc_t ompd_release_parallel_handle( ) { if (!parallel_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(parallel_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -441,7 +441,7 @@ ompd_rc_t ompd_get_current_task_handle( if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(task_handle)); if (ret != ompd_rc_ok) return ret; @@ -501,7 +501,7 @@ ompd_rc_t ompd_get_generating_task_handle( return ret; } - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(parent_task_handle)); if (ret != ompd_rc_ok) return ret; @@ -539,7 +539,7 @@ ompd_rc_t ompd_get_scheduling_task_handle( if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(parent_task_handle)); if (ret != ompd_rc_ok) return ret; @@ -577,7 +577,7 @@ ompd_rc_t ompd_get_task_in_parallel( if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(task_handle)); if (ret != ompd_rc_ok) return ret; @@ -592,7 +592,7 @@ ompd_rc_t ompd_release_task_handle( ) { if (!task_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(task_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(task_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -835,7 +835,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t if (tId != p->threadIdx.x) return ompd_rc_stale_handle; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; @@ -861,7 +861,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t ret = th.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; @@ -1418,12 +1418,12 @@ ompd_rc_t initTypeSizes(ompd_address_space_context_t *context) { static ompd_rc_t ret; if (inited) return ret; - ret = callbacks->tsizeof_prim(context, &type_sizes); + ret = callbacks->sizeof_types(context, &type_sizes); if (ret != ompd_rc_ok) return ret; if (!(type_sizes.sizeof_pointer > 0)) return ompd_rc_error; - ret = callbacks->tsizeof_prim(context, &TValue::type_sizes); + ret = callbacks->sizeof_types(context, &TValue::type_sizes); if (ret != ompd_rc_ok) return ret; inited = 1; diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index b4fdff0b1..c76321b75 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -48,25 +48,25 @@ class ompdAllocatable { public: static void *operator new(std::size_t sz) { void *res; - ompd_rc_t ret = callbacks->dmemory_alloc(sz, &res); + ompd_rc_t ret = callbacks->memory_alloc(sz, &res); if (ret == ompd_rc_ok) return res; throw 
std::bad_alloc(); } static void *operator new[](std::size_t sz) { void *res; - ompd_rc_t ret = callbacks->dmemory_alloc(sz, &res); + ompd_rc_t ret = callbacks->memory_alloc(sz, &res); if (ret == ompd_rc_ok) return res; throw std::bad_alloc(); } void operator delete(void *addr) throw() { - ompd_rc_t ret = callbacks->dmemory_free(addr); + ompd_rc_t ret = callbacks->memory_free(addr); if (ret != ompd_rc_ok) throw std::bad_alloc(); } void operator delete[](void *addr) throw() { - ompd_rc_t ret = callbacks->dmemory_free(addr); + ompd_rc_t ret = callbacks->memory_free(addr); if (ret != ompd_rc_ok) throw std::bad_alloc(); } diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h index b54275d91..96824c7cb 100644 --- a/libompd/src/ompd-private.h +++ b/libompd/src/ompd-private.h @@ -66,4 +66,25 @@ typedef enum ompd_target_prim_types_t { ompd_type_max } ompd_target_prim_types_t; +/** + * Logical coordinates of OMP target device threads + */ +typedef struct ompd_dim3_t { + ompd_word_t x; + ompd_word_t y; + ompd_word_t z; +} ompd_dim3_t; + +typedef struct ompd_cudathread_coord_t { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work + // with grids too well. + ompd_dim3_t gridDim; + ompd_dim3_t blockDim; + ompd_dim3_t blockIdx; + ompd_dim3_t threadIdx; +} ompd_cudathread_coord_t; #endif /*SRC_OMPD_PRIVATE_H*/ diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 8adb54fd0..b5d971d0c 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -132,28 +132,6 @@ typedef enum ompd_thread_id_kind_t { ompd_thread_id_cudalogical = 3 } ompd_thread_id_kind_t; -/** - * Logical coordinates of OMP target device threads - */ -typedef struct ompd_dim3_t { - ompd_word_t x; - ompd_word_t y; - ompd_word_t z; -} ompd_dim3_t; - -typedef struct ompd_cudathread_coord_t { - ompd_addr_t cudaDevId; - ompd_addr_t cudaContext; - ompd_addr_t warpSize; - ompd_addr_t gridId; - ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work - // with grids too well. - ompd_dim3_t gridDim; - ompd_dim3_t blockDim; - ompd_dim3_t blockIdx; - ompd_dim3_t threadIdx; -} ompd_cudathread_coord_t; - /** * Return codes. * Each OMPD operation returns a code. @@ -198,7 +176,7 @@ typedef struct ompd_device_type_sizes_t { /** * Allocate memory in the debugger's address space. */ -typedef ompd_rc_t (*ompd_dmemory_alloc_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)( ompd_size_t bytes, /* IN: bytes of the primitive type */ void **ptr /* OUT: pointer of the allocated memory */ ); @@ -206,14 +184,14 @@ typedef ompd_rc_t (*ompd_dmemory_alloc_fn_t)( /** * Free memory in the debugger's address space. */ -typedef ompd_rc_t (*ompd_dmemory_free_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)( void *ptr /* IN: pointer of memory to deallocate */ ); /** * Get thread specific context. 
*/ -typedef ompd_rc_t (*ompd_get_thread_context_for_thread_id_fn_t)( +typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)( ompd_address_space_context_t *context, ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_context_t **thread_context); @@ -221,7 +199,7 @@ typedef ompd_rc_t (*ompd_get_thread_context_for_thread_id_fn_t)( /** * Look up the sizes of primitive types in the target */ -typedef ompd_rc_t (*ompd_tsizeof_prim_fn_t)( +typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_device_type_sizes_t *sizes /* OUT: type sizes */ @@ -230,7 +208,7 @@ typedef ompd_rc_t (*ompd_tsizeof_prim_fn_t)( /** * Look up the address of a global symbol in the target */ -typedef ompd_rc_t (*ompd_tsymbol_addr_fn_t)( +typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -242,7 +220,7 @@ typedef ompd_rc_t (*ompd_tsymbol_addr_fn_t)( /** * Read memory from the target */ -typedef ompd_rc_t (*ompd_tmemory_read_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -255,7 +233,7 @@ typedef ompd_rc_t (*ompd_tmemory_read_fn_t)( /** * Write memory from the target */ -typedef ompd_rc_t (*ompd_tmemory_write_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -265,7 +243,7 @@ typedef ompd_rc_t (*ompd_tmemory_write_fn_t)( const void *buffer /* IN: output buffer */ ); -typedef ompd_rc_t (*ompd_target_host_fn_t)( +typedef ompd_rc_t (*ompd_callback_device_host_fn_t)( ompd_address_space_context_t *address_space_context, /* IN */ const void *input, /* IN */ int unit_size, /* IN */ @@ -278,7 +256,7 @@ typedef ompd_rc_t (*ompd_target_host_fn_t)( * This is used by the OMPD library to have the debugger print a string. * The OMPD should not print directly. 
*/ -typedef ompd_rc_t (*ompd_print_string_fn_t)( +typedef ompd_rc_t (*ompd_callback_print_string_fn_t)( const char *str /* IN: message to print */ ); @@ -287,22 +265,20 @@ typedef ompd_rc_t (*ompd_print_string_fn_t)( */ typedef struct ompd_callbacks_t { /* Debugger interface */ - ompd_dmemory_alloc_fn_t dmemory_alloc; - ompd_dmemory_free_fn_t dmemory_free; - ompd_print_string_fn_t print_string; + ompd_callback_memory_alloc_fn_t memory_alloc; + ompd_callback_memory_free_fn_t memory_free; + ompd_callback_print_string_fn_t print_string; /* Target interface */ - ompd_tsizeof_prim_fn_t tsizeof_prim; - ompd_tsymbol_addr_fn_t tsymbol_addr; - ompd_tmemory_read_fn_t read_tmemory; - ompd_tmemory_write_fn_t write_tmemory; - - ompd_target_host_fn_t target_to_host; - ompd_target_host_fn_t host_to_target; + ompd_callback_sizeof_fn_t sizeof_types; + ompd_callback_symbol_addr_fn_t symbol_addr_lookup; + ompd_callback_memory_read_fn_t read_memory; + ompd_callback_memory_write_fn_t write_memory; - ompd_get_thread_context_for_thread_id_fn_t get_thread_context_for_thread_id; - // ompd_get_process_context_for_context_fn_t get_containing_process_context; + ompd_callback_device_host_fn_t device_to_host; + ompd_callback_device_host_fn_t host_to_device; + ompd_callback_get_thread_context_for_thread_id_fn_t get_thread_context_for_thread_id; } ompd_callbacks_t; /****************************************************************************** From cffa430f1ccc98640e0d95d84e1052cfea1b73f7 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 3 Jul 2018 11:36:02 -0700 Subject: [PATCH 08/64] Fix formatting mistake --- .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 704 +++++++++--------- 1 file changed, 352 insertions(+), 352 deletions(-) diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index ebda05654..d01830872 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -66,376 +66,376 @@ #endif // arguments needed for L0 parallelism only. - class omptarget_nvptx_SharedArgs { +class omptarget_nvptx_SharedArgs { #if OMPD_SUPPORT - friend void __device__ ompd_init( void ); + friend void __device__ ompd_init( void ); #endif /* OMPD_SUPPORT */ - public: - // All these methods must be called by the master thread only. - INLINE void Init() { - args = buffer; - nArgs = MAX_SHARED_ARGS; +public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + Init(); } - INLINE void DeInit() { - // Free any memory allocated for outlined parallel function with a large - // number of arguments. + } + INLINE void EnsureSize(size_t size) { + if (size > nArgs) { if (nArgs > MAX_SHARED_ARGS) { SafeFree(args, (char *)"new extended args"); - Init(); } + args = (void **) SafeMalloc(size * sizeof(void *), + (char *)"new extended args"); + nArgs = size; } - INLINE void EnsureSize(size_t size) { - if (size > nArgs) { - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, (char *)"new extended args"); - } - args = (void **) SafeMalloc(size * sizeof(void *), - (char *)"new extended args"); - nArgs = size; - } - } - // Called by all threads. - INLINE void **GetArgs() { return args; }; - private: - // buffer of pre-allocated arguments. 
- void *buffer[MAX_SHARED_ARGS]; - // pointer to arguments buffer. - // starts off as a pointer to 'buffer' but can be dynamically allocated. - void **args; - // starts off as MAX_SHARED_ARGS but can increase in size. - uint32_t nArgs; - }; - - extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; - - // Data sharing related quantities, need to match what is used in the compiler. - enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. - DS_Max_Worker_Threads = 992, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 32, - }; - - // Data structure to keep in shared memory that traces the current slot, stack, - // and frame pointer as well as the active threads that didn't exit the current - // environment. - struct DataSharingStateTy { - __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; - void *StackPtr[DS_Max_Warp_Number]; - __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; - int32_t ActiveThreads[DS_Max_Warp_Number]; - }; - // Additional worker slot type which is initialized with the default worker slot - // size of 4*32 bytes. - struct __kmpc_data_sharing_worker_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; - }; - // Additional master slot type which is initialized with the default master slot - // size of 4 bytes. - struct __kmpc_data_sharing_master_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Slot_Size]; - }; - extern __device__ __shared__ DataSharingStateTy DataSharingState; - - //////////////////////////////////////////////////////////////////////////////// - // task ICV and (implicit & explicit) task state - - class omptarget_nvptx_TaskDescr { + } + // Called by all threads. + INLINE void **GetArgs() { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, +}; + +// Data structure to keep in shared memory that traces the current slot, stack, +// and frame pointer as well as the active threads that didn't exit the current +// environment. 
+struct DataSharingStateTy { + __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; + void *StackPtr[DS_Max_Warp_Number]; + __kmpc_data_sharing_slot *TailPtr[DS_Max_Warp_Number]; + void *FramePtr[DS_Max_Warp_Number]; + int32_t ActiveThreads[DS_Max_Warp_Number]; +}; +// Additional worker slot type which is initialized with the default worker slot +// size of 4*32 bytes. +struct __kmpc_data_sharing_worker_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; +}; +// Additional master slot type which is initialized with the default master slot +// size of 4 bytes. +struct __kmpc_data_sharing_master_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Slot_Size]; +}; +extern __device__ __shared__ DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { #if OMPD_SUPPORT - friend void __device__ ompd_init( void ); - friend void __device__ ompd_set_device_thread_state(omp_state_t state); + friend void __device__ ompd_init( void ); + friend void __device__ ompd_set_device_thread_state(omp_state_t state); #endif /* OMPD_SUPPORT */ - public: - // methods for flags - INLINE omp_sched_t GetRuntimeSched(); - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; } - INLINE void SetDynamic() { - items.flags = items.flags | TaskDescr_IsDynamic; - } - INLINE void ClearDynamic() { - items.flags = items.flags & (~TaskDescr_IsDynamic); - } - INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() { - return items.flags & TaskDescr_InParL2P; - } - INLINE int IsParallelConstruct() { - return items.flags & TaskDescr_IsParConstr; - } - INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } - // methods for other fields - INLINE uint16_t &NThreads() { return items.nthreads; } - INLINE uint16_t &ThreadLimit() { return items.threadlimit; } - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } - INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } - INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { - prev = taskDescr; - } - // init & copy - INLINE void InitLevelZeroTaskDescr(); - INLINE void InitLevelOneTaskDescr(uint16_t tnum, - omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, - uint16_t tnum); - INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); - INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, - uint16_t tid, uint16_t tnum); - - private: - // bits for flags: (7 used, 1 free) - // 3 bits (SchedMask) for runtime schedule - // 1 bit (IsDynamic) for dynamic schedule (false = static) - // 1 bit (InPar) if this thread has encountered one or more parallel region - // 1 bit 
(IsParConstr) if ICV for a parallel region (false = explicit task) - // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel - // region - static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); - static const uint8_t TaskDescr_IsDynamic = 0x8; - static const uint8_t TaskDescr_InPar = 0x10; - static const uint8_t TaskDescr_IsParConstr = 0x20; - static const uint8_t TaskDescr_InParL2P = 0x40; - - struct TaskDescr_items { - uint8_t flags; // 6 bit used (see flag above) - uint8_t unused; - uint16_t nthreads; // thread num for subsequent parallel regions - uint16_t threadlimit; // thread limit ICV - uint16_t threadId; // thread id - uint16_t threadsInTeam; // threads in current team - uint64_t runtimeChunkSize; // runtime chunk size - } items; +public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched(); + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int IsDynamic() { return items.flags & TaskDescr_IsDynamic; } + INLINE void SetDynamic() { + items.flags = items.flags | TaskDescr_IsDynamic; + } + INLINE void ClearDynamic() { + items.flags = items.flags & (~TaskDescr_IsDynamic); + } + INLINE int InParallelRegion() { return items.flags & TaskDescr_InPar; } + INLINE int InL2OrHigherParallelRegion() { + return items.flags & TaskDescr_InParL2P; + } + INLINE int IsParallelConstruct() { + return items.flags & TaskDescr_IsParConstr; + } + INLINE int IsTaskConstruct() { return !IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t &NThreads() { return items.nthreads; } + INLINE uint16_t &ThreadLimit() { return items.threadlimit; } + INLINE uint16_t &ThreadId() { return items.threadId; } + INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; } + INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + prev = taskDescr; + } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelOneTaskDescr(uint16_t tnum, + omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, + uint16_t tnum); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + +private: + // bits for flags: (7 used, 1 free) + // 3 bits (SchedMask) for runtime schedule + // 1 bit (IsDynamic) for dynamic schedule (false = static) + // 1 bit (InPar) if this thread has encountered one or more parallel region + // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel + // region + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_IsDynamic = 0x8; + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + static const uint8_t TaskDescr_InParL2P = 0x40; + + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t nthreads; // thread num for subsequent parallel regions + uint16_t threadlimit; // thread limit ICV 
+ uint16_t threadId; // thread id + uint16_t threadsInTeam; // threads in current team + uint64_t runtimeChunkSize; // runtime chunk size + } items; #ifdef OMPD_SUPPORT - ompd_nvptx_thread_info_t ompd_thread_info; + ompd_nvptx_thread_info_t ompd_thread_info; #endif - omptarget_nvptx_TaskDescr *prev; - }; - - // build on kmp - typedef struct omptarget_nvptx_ExplicitTaskDescr { - omptarget_nvptx_TaskDescr - taskDescr; // omptarget_nvptx task description (must be first) - kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) - } omptarget_nvptx_ExplicitTaskDescr; - - //////////////////////////////////////////////////////////////////////////////// - // Descriptor of a parallel region (worksharing in general) - - class omptarget_nvptx_WorkDescr { - - public: - // access to data - INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } - INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } - // init - INLINE void InitWorkDescr(); - - private: - omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) - omptarget_nvptx_TaskDescr masterTaskICV; - bool hasCancel; - }; - - //////////////////////////////////////////////////////////////////////////////// - - class omptarget_nvptx_TeamDescr { - public: - // access to data - INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { - return &levelZeroTaskDescr; - } - INLINE omptarget_nvptx_WorkDescr &WorkDescr() { - return workDescrForActiveParallel; - } - INLINE omp_lock_t *CriticalLock() { return &criticalLock; } - INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } - - // init - INLINE void InitTeamDescr(); - - INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { - // If this is invoked by the master thread of the master warp then intialize - // it with a smaller slot. - if (IsMasterThread) { - // Do not initalize this slot again if it has already been initalized. - if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the - // data section. DataEnd is non-inclusive. - master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; - // We currently do not have a next slot. 
- master_rootS[0].Next = 0; - master_rootS[0].Prev = 0; - master_rootS[0].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&master_rootS[0]; - } + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + omptarget_nvptx_TaskDescr + taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + +public: + // access to data + INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; } + INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + // init + INLINE void InitWorkDescr(); + +private: + omptarget_nvptx_CounterGroup cg; // for barrier (no other needed) + omptarget_nvptx_TaskDescr masterTaskICV; + bool hasCancel; +}; + +//////////////////////////////////////////////////////////////////////////////// + +class omptarget_nvptx_TeamDescr { +public: + // access to data + INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { + return &levelZeroTaskDescr; + } + INLINE omptarget_nvptx_WorkDescr &WorkDescr() { + return workDescrForActiveParallel; + } + INLINE omp_lock_t *CriticalLock() { return &criticalLock; } + INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } + + // init + INLINE void InitTeamDescr(); + + INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { + // If this is invoked by the master thread of the master warp then intialize + // it with a smaller slot. + if (IsMasterThread) { // Do not initalize this slot again if it has already been initalized. - if (worker_rootS[wid].DataEnd == - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) + if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) return 0; - // Initialize the pointer to the end of the slot given the size of the data - // section. DataEnd is non-inclusive. - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; + // Initialize the pointer to the end of the slot given the size of the + // data section. DataEnd is non-inclusive. + master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; // We currently do not have a next slot. - worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; + master_rootS[0].Next = 0; + master_rootS[0].Prev = 0; + master_rootS[0].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&master_rootS[0]; } - - private: - omptarget_nvptx_TaskDescr - levelZeroTaskDescr; // icv for team master initial thread - omptarget_nvptx_WorkDescr - workDescrForActiveParallel; // one, ONLY for the active par - omp_lock_t criticalLock; - uint64_t lastprivateIterBuffer; - - __align__(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1]; - __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; - }; - - //////////////////////////////////////////////////////////////////////////////// - // thread private data (struct of arrays for better coalescing) - // tid refers here to the global thread id - // do not support multiple concurrent kernel a this time - class omptarget_nvptx_ThreadPrivateContext { + // Do not initalize this slot again if it has already been initalized. 
+ if (worker_rootS[wid].DataEnd == + &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) + return 0; + // Initialize the pointer to the end of the slot given the size of the data + // section. DataEnd is non-inclusive. + worker_rootS[wid].DataEnd = + &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; + // We currently do not have a next slot. + worker_rootS[wid].Next = 0; + worker_rootS[wid].Prev = 0; + worker_rootS[wid].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; + } + +private: + omptarget_nvptx_TaskDescr + levelZeroTaskDescr; // icv for team master initial thread + omptarget_nvptx_WorkDescr + workDescrForActiveParallel; // one, ONLY for the active par + omp_lock_t criticalLock; + uint64_t lastprivateIterBuffer; + + __align__(16) + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1]; + __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; +}; + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) +// tid refers here to the global thread id +// do not support multiple concurrent kernel a this time +class omptarget_nvptx_ThreadPrivateContext { #if OMPD_SUPPORT - friend void __device__ ompd_init( void ); + friend void __device__ ompd_init( void ); #endif /* OMPD_SUPPORT */ - public: - // task - INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { - return &levelOneTaskDescr[tid]; - } - INLINE void SetTopLevelTaskDescr(int tid, - omptarget_nvptx_TaskDescr *taskICV) { - topTaskDescr[tid] = taskICV; - } - INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid); +public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { + return &levelOneTaskDescr[tid]; + } + INLINE void SetTopLevelTaskDescr(int tid, + omptarget_nvptx_TaskDescr *taskICV) { + topTaskDescr[tid] = taskICV; + } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid); + // parallel + INLINE uint16_t &NumThreadsForNextParallel(int tid) { + return nextRegion.tnum[tid]; + } + // simd + INLINE uint16_t &SimdLimitForNextSimd(int tid) { + return nextRegion.slim[tid]; + } + // sync + INLINE Counter &Priv(int tid) { return priv[tid]; } + INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; } + // schedule (for dispatch) + INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } + INLINE int64_t &Chunk(int tid) { return chunk[tid]; } + INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } + // state for dispatch with dyn/guided + INLINE Counter &CurrentEvent(int tid) { + return currEvent_or_nextLowerBound[tid]; + } + INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } + // state for dispatch with static + INLINE Counter &NextLowerBound(int tid) { + return currEvent_or_nextLowerBound[tid]; + } + INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } + + INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } + + INLINE void InitThreadPrivateContext(int tid); + INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } + INLINE uint64_t GetSourceQueue() { return SourceQueue; } + +private: + // team context for this team + omptarget_nvptx_TeamDescr teamContext; + // task ICV for implict threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; + union { + // 
Only one of the two is live at the same time. // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } - // simd - INLINE uint16_t &SimdLimitForNextSimd(int tid) { - return nextRegion.slim[tid]; - } - // sync - INLINE Counter &Priv(int tid) { return priv[tid]; } - INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; } - // schedule (for dispatch) - INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } - INLINE int64_t &Chunk(int tid) { return chunk[tid]; } - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - // state for dispatch with dyn/guided - INLINE Counter &CurrentEvent(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; } - // state for dispatch with static - INLINE Counter &NextLowerBound(int tid) { - return currEvent_or_nextLowerBound[tid]; - } - INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; } - - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } - - INLINE void InitThreadPrivateContext(int tid); - INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; } - INLINE uint64_t GetSourceQueue() { return SourceQueue; } - - private: - // team context for this team - omptarget_nvptx_TeamDescr teamContext; - // task ICV for implict threads in the only parallel region - omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; - // pointer where to find the current task ICV (top of the stack) - omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - union { - // Only one of the two is live at the same time. - // parallel - uint16_t tnum[MAX_THREADS_PER_TEAM]; - // simd limit - uint16_t slim[MAX_THREADS_PER_TEAM]; - } nextRegion; - // sync - Counter priv[MAX_THREADS_PER_TEAM]; - // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; - Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; - // Queue to which this object must be returned. - uint64_t SourceQueue; - }; - - /// Device envrionment data - struct omptarget_device_environmentTy { - int32_t debug_level; - }; - - //////////////////////////////////////////////////////////////////////////////// - // global device envrionment - //////////////////////////////////////////////////////////////////////////////// - - extern __device__ omptarget_device_environmentTy omptarget_device_environment; - - //////////////////////////////////////////////////////////////////////////////// - - //////////////////////////////////////////////////////////////////////////////// - // global data tables - //////////////////////////////////////////////////////////////////////////////// - - extern __device__ __shared__ - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - extern __device__ __shared__ uint32_t execution_param; - extern __device__ __shared__ void *ReductionScratchpadPtr; - - //////////////////////////////////////////////////////////////////////////////// - // work function (outlined parallel/simd functions) and arguments. - // needed for L1 parallelism only. 
- //////////////////////////////////////////////////////////////////////////////// - - typedef void *omptarget_nvptx_WorkFn; - extern volatile __device__ __shared__ omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; - - //////////////////////////////////////////////////////////////////////////////// - // get private data structures - //////////////////////////////////////////////////////////////////////////////// - - INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); - INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); - INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); - INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); - - //////////////////////////////////////////////////////////////////////////////// - // inlined implementation - //////////////////////////////////////////////////////////////////////////////// + uint16_t tnum[MAX_THREADS_PER_TEAM]; + // simd limit + uint16_t slim[MAX_THREADS_PER_TEAM]; + } nextRegion; + // sync + Counter priv[MAX_THREADS_PER_TEAM]; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for + int64_t chunk[MAX_THREADS_PER_TEAM]; + int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; + // state for dispatch with dyn/guided OR static (never use both at a time) + Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM]; + Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; + // Queue to which this object must be returned. + uint64_t SourceQueue; +}; + +/// Device envrionment data +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +//////////////////////////////////////////////////////////////////////////////// +// global device envrionment +//////////////////////////////////////////////////////////////////////////////// + +extern __device__ omptarget_device_environmentTy omptarget_device_environment; + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern __device__ __shared__ + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; +extern __device__ __shared__ uint32_t execution_param; +extern __device__ __shared__ void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// work function (outlined parallel/simd functions) and arguments. +// needed for L1 parallelism only. 
+//////////////////////////////////////////////////////////////////////////////// + +typedef void *omptarget_nvptx_WorkFn; +extern volatile __device__ __shared__ omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// #include "counter_groupi.h" #include "omptarget-nvptxi.h" From e72f4a75e326e375df18b05f335b7d8881375d23 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 3 Jul 2018 14:46:07 -0700 Subject: [PATCH 09/64] [OMPD] Add parallel and reduction state to npvtx --- libomptarget/deviceRTLs/nvptx/src/loop.cu | 6 ++++++ libomptarget/deviceRTLs/nvptx/src/ompd-specific.h | 7 ++++--- libomptarget/deviceRTLs/nvptx/src/reduction.cu | 13 +++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index 2c115a980..4127cffa2 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -360,6 +360,9 @@ public: omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), omptarget_nvptx_threadPrivateContext->Chunk(teamId)); } +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// @@ -472,6 +475,9 @@ public: INLINE static void dispatch_fini() { // nothing +#ifdef OMP_SUPPORT + ompd_reset_device_thread_state() +#endif } //////////////////////////////////////////////////////////////////////////////// diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 38cf70e3b..8b33a0ea8 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -31,9 +31,10 @@ extern "C" __device__ void ompd_bp_task_end ( void ); /* we only support work states for the moment */ typedef enum { - omp_state_undefined = 0x102, - omp_state_work_serial = 0x000, - omp_state_work_parallel = 0x001 + omp_state_undefined = 0x102, + omp_state_work_serial = 0x000, + omp_state_work_parallel = 0x001, + omp_state_work_reduction = 0x002 } omp_state_t; __device__ void ompd_set_device_thread_state(omp_state_t); diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index afa8e81eb..132f11fa4 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -171,6 +171,9 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, * 3. Warp 0 reduces to a single value. * 4. The reduced value is available in the thread that returns 1. 
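 *
 * Hedged sketch of the OMPD state bracketing that the hunks below add around
 * this sequence (guarded the same way as the code):
 *
 *   #ifdef OMPD_SUPPORT
 *     ompd_set_device_thread_state(omp_state_work_reduction);
 *   #endif
 *   ... perform reduction steps 1-4 above ...
 *   #ifdef OMPD_SUPPORT
 *     ompd_reset_device_thread_state();
 *   #endif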
*/ +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_reduction); +#endif /*OMPD_SUPPORT*/ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); @@ -248,6 +251,10 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized) == 0; #endif // __CUDA_ARCH__ >= 700 + +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ } EXTERN @@ -289,6 +296,9 @@ int32_t nvptx_teams_reduce_nowait( // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_reduction); +#endif /*OMPD_SUPPORT*/ uint32_t NumThreads = isSPMDExecutionMode ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, @@ -403,6 +413,9 @@ int32_t nvptx_teams_reduce_nowait( } #endif // __CUDA_ARCH__ >= 700 +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ return ThreadId == 0; } From 27e8cc85a37aa405dc227573517648250f383b66 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 5 Jul 2018 07:56:57 -0700 Subject: [PATCH 10/64] [OMPD] Add reduction state + save thread coords --- .../deviceRTLs/nvptx/src/ompd-specific.cu | 2 ++ .../deviceRTLs/nvptx/src/ompd-specific.h | 7 ++++++- libomptarget/deviceRTLs/nvptx/src/reduction.cu | 17 +++++++++++++---- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index 19dcbf52f..22a15ae45 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -29,6 +29,8 @@ __device__ __shared__ __device__ void ompd_init ( void ) { getMyTopTaskDescriptor()->ompd_thread_info.state = omp_state_undefined; + getMyTopTaskDescriptor()->ompd_thread_info.blockIdx_x = blockIdx.x; + getMyTopTaskDescriptor()->ompd_thread_info.threadIdx_x = threadIdx.x; if (ompd_target_initialized) return; diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 8b33a0ea8..4fb51f08a 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -44,7 +44,12 @@ INLINE void ompd_reset_device_thread_state() { } typedef struct { - uint64_t state; + uint64_t state; // In the host runtime we use the OMPT state. + // Here we need to have our own place to store it. + int blockIdx_x; // Libomptarget should only schedule task in one dimension. 
+ // To store a unique identifier for the current thread, we + // simply store ThreadIdx.x and BlockIdx.x + int threadIdx_x; } ompd_nvptx_thread_info_t; #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index 132f11fa4..42c454872 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -206,8 +206,17 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, BlockThreadId); +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + return BlockThreadId == 0; } + +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + return BlockThreadId == 0; #else uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); @@ -246,15 +255,15 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, return BlockThreadId == 0; } +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + // Get the OMP thread Id. This is different from BlockThreadId in the case of // an L2 parallel region. return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized) == 0; #endif // __CUDA_ARCH__ >= 700 - -#ifdef OMPD_SUPPORT - ompd_reset_device_thread_state(); -#endif /*OMPD_SUPPORT*/ } EXTERN From 322f3f642e160a3d05860c83cd028aebd8d8a381 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 10 Jul 2018 09:12:15 -0700 Subject: [PATCH 11/64] [OMPD] cleanup and + ompd parallel handle for cuda --- libompd/gdb-wrapper/OMPDCommand.cpp | 101 +++-- libompd/gdb-wrapper/OMPDCommand.h | 1 + libompd/src/CMakeLists.txt | 2 +- libompd/src/omp-debug.cpp | 586 ++++++---------------------- libompd/src/omp-debug.h | 4 +- libompd/src/omp-state.cpp | 93 +++++ libompd/src/ompd-private.h | 2 + libompd/src/ompd.h | 10 +- 8 files changed, 286 insertions(+), 513 deletions(-) create mode 100644 libompd/src/omp-state.cpp diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index e1582716b..13c2c6c97 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -70,11 +70,6 @@ OMPDCommandFactory::OMPDCommandFactory() FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) #undef OMPD_FIND_API_FUNCTION - //functions->test_CB_tsizeof_prim = - // (void (*)()) findFunctionInLibrary("test_CB_tsizeof_prim"); - //functions->test_CB_dmemory_alloc = - // (void (*)()) findFunctionInLibrary("test_CB_dmemory_alloc"); - // Initialize OMPD library ompd_callbacks_t *table = getCallbacksTable(); assert(table && "Invalid callbacks table"); @@ -95,11 +90,6 @@ FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) OMPDCommandFactory::~OMPDCommandFactory() { ompd_rc_t ret; -// ret = functions->ompd_process_finalize(prochandle); -// if (ret != ompd_rc_ok) -// { -// out << "ERROR: could not finalize target process\n"; -// } ret = functions->ompd_release_address_space_handle(addrhandle); if (ret != ompd_rc_ok) { @@ -185,6 +175,17 @@ const char* OMPDTestCallbacks::toString() const void OMPDThreads::execute() const { + // get state names + map host_state_names; + ompd_word_t more_states = 1; + ompd_word_t next_state = omp_state_undefined; + host_state_names[next_state] = "ompd_state_undefined"; + while (more_states) { + const char *state_name; + functions->ompd_enumerate_states(addrhandle, next_state, &next_state, &state_name, &more_states); + host_state_names[next_state] = state_name; + } + 
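    // Hedged, illustrative variant of the loop above (not part of this patch):
    // the map's element types written out and the return code checked; the
    // functions/addrhandle members are the same ones the wrapper already uses.
    std::map<ompd_word_t, const char *> state_names;
    ompd_word_t more = 1, next = omp_state_undefined;
    state_names[next] = "omp_state_undefined";
    while (more) {
      const char *name = nullptr;
      if (functions->ompd_enumerate_states(addrhandle, next, &next,
                                           &name, &more) != ompd_rc_ok)
        break;
      state_names[next] = name;
    }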
printf("\nHOST THREADS\n"); printf("Debugger_handle Thread_handle System_thread\n"); printf("--------------------------------------------------\n"); @@ -200,8 +201,8 @@ void OMPDThreads::execute() const ompd_word_t state; ompd_wait_id_t wait_id; ret = functions->ompd_get_state(thread_handle, &state, &wait_id); - printf(" %-12u %p 0x%lx\t%i\t%lx\n", - (unsigned int)i.first, thread_handle, i.second, state, wait_id); + printf(" %-12u %p 0x%lx\t%s\t%lx\n", + (unsigned int)i.first, thread_handle, i.second, host_state_names[state], wait_id); functions->ompd_release_thread_handle(thread_handle); } else @@ -217,10 +218,17 @@ void OMPDThreads::execute() const map address_spaces; ompd_word_t last_state = -1; ompd_cudathread_coord_t last_coords; + vector device_thread_handles; + + // get cuda states + map cuda_state_names; + more_states = 1; + next_state = omp_state_undefined; + cuda_state_names[next_state] = "omp_state_undefined"; printf("\nCUDA THREADS\n"); - printf("Cuda block from Thread to Thread state\n"); - printf("-------------------------------------------------\n"); + printf("Cuda block from Thread to Thread state\n"); + printf("------------------------------------------\n"); for(auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { @@ -232,7 +240,7 @@ void OMPDThreads::execute() const result = functions->ompd_device_initialize( addrhandle, cpool->getGlobalOmpdContext(), - ompd_device_kind_cuda, + ompd_device_kind_cuda, sizeof(i.coord.cudaContext), &i.coord.cudaContext, &cpool->ompd_device_handle); @@ -243,6 +251,13 @@ void OMPDThreads::execute() const } address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; + while (more_states) { + const char *state_name; + functions->ompd_enumerate_states(cpool->ompd_device_handle, + next_state, &next_state, + &state_name, &more_states); + cuda_state_names[next_state] = state_name; + } } ompd_thread_handle_t* thread_handle; @@ -255,26 +270,44 @@ void OMPDThreads::execute() const if (ret == ompd_rc_ok) { ompd_word_t state; + device_thread_handles.push_back(thread_handle); functions->ompd_get_state(thread_handle, &state, NULL); if (last_state == -1) { last_state = state; last_coords = i.coord; - printf("(%li,0,0) (%li,%li,%li)", i.coord.blockIdx.x, i.coord.threadIdx.x, i.coord.threadIdx.y, i.coord.threadIdx.z); + printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); } else if (state != last_state || i.coord.blockIdx.x != last_coords.blockIdx.x) { - printf(" (%li,%li,%li) %li\n", last_coords.threadIdx.x, last_coords.threadIdx.y, last_coords.threadIdx.z, last_state); + printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); last_coords = i.coord; last_state = state; - printf("(%li,0,0) (%li,%li,%li)", i.coord.blockIdx.x, i.coord.threadIdx.x, i.coord.threadIdx.y, i.coord.threadIdx.z); + printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); } else { /* state == last_state*/ last_coords = i.coord; } - functions->ompd_release_thread_handle(thread_handle); omp_cuda_threads++; } } + // Check for non-unique handles + for (auto i: device_thread_handles) { + for (auto j: device_thread_handles) { + int value; + if (i == j) { + continue; + } + ompd_rc_t ret = functions->ompd_thread_handle_compare(i, j, &value); + if (!value) { + printf("FOUND NON-UNIQUE THREAD HANDLES FOR DIFFERENT THREADS\n"); + } + } + } + + // release thread handles + for (auto i: device_thread_handles) { + functions->ompd_release_thread_handle(i); + } if (last_state != -1) { - printf(" (%i,%i,%i) %i\n", 
last_coords.threadIdx.x, last_coords.threadIdx.y, last_coords.threadIdx.z, last_state); + printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); } if (cuda.threads.size() != 0) { @@ -329,7 +362,7 @@ const char* OMPDLevels::toString() const void OMPDCallback::execute() const -{ +{ ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") @@ -337,7 +370,7 @@ void OMPDCallback::execute() const hout << "callbacks available: read_tmemory, ttype, ttype_sizeof, ttype_offset, tsymbol_addr" << endl << "Use \"odb callback \" to get more help on the usage" << endl; return; - } + } /*ompd_rc_t CB_read_tmemory ( ompd_context_t *context, @@ -390,7 +423,7 @@ const char* OMPDCallback ::toString() const } void OMPDApi::execute() const -{ +{ ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") @@ -398,7 +431,7 @@ void OMPDApi::execute() const hout << "API functions available: read_tmemory, ttype, ttype_sizeof, ttype_offset, tsymbol_addr" << endl << "Use \"odb api \" to get more help on the usage" << endl; return; - } + } //ompd_rc_t ompd_get_threads ( // ompd_context_t *context, /* IN: debugger handle for the target */ @@ -416,8 +449,8 @@ void OMPDApi::execute() const } ompd_thread_handle_t ** thread_handle_array; int num_handles; - - + + ret = functions->ompd_get_threads ( addrhandle, &thread_handle_array, &num_handles); if (ret != ompd_rc_ok) @@ -425,7 +458,7 @@ void OMPDApi::execute() const sout << num_handles << " OpenMP threads:" << endl; for (int i=0; i odbGetParallelRegions(OMPDFunctionsPtr functions ompd_parallel_handle_t * parallel_handle; vector parallel_handles; ret = functions->ompd_get_current_parallel_handle( - th, ¶llel_handle); + th, ¶llel_handle); while(ret == ompd_rc_ok) { parallel_handles.push_back(parallel_handle); ret = functions->ompd_get_enclosing_parallel_handle( - parallel_handle, ¶llel_handle); + parallel_handle, ¶llel_handle); } return parallel_handles; } @@ -552,7 +585,7 @@ vector odbGetTaskRegions(OMPDFunctionsPtr functions, ompd_t ompd_task_handle_t *task_handle; vector task_handles; ret = functions->ompd_get_current_task_handle( - th, &task_handle); + th, &task_handle); while(ret == ompd_rc_ok) { task_handles.push_back(task_handle); @@ -577,7 +610,7 @@ vector odbGetImplicitTasks(OMPDFunctionsPtr functions, ompd #if 0 ompd_task_handle_t* task_handles; /*ret = */functions->ompd_get_task_in_parallel( - ph, &task_handles, &num_tasks); + ph, &task_handles, &num_tasks); for(int i=0; i odbGetImplicitTasks(OMPDFunctionsPtr functions, ompd } void OMPDTest::execute() const -{ +{ // ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") { hout << "Test suites available: threads, parallel, tasks" << endl; return; - } + } if (extraArgs[0] == "threads") { @@ -611,7 +644,7 @@ void OMPDTest::execute() const { auto parallel_h = odbGetParallelRegions(functions, thr_h); auto task_h = odbGetTaskRegions(functions, thr_h); - + sout << "Thread handle: 0x" << hex << thr_h << endl << "Parallel: "; for(auto ph: parallel_h) { diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index b3c671031..3ccc4e805 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -86,6 +86,7 @@ macro(ompd_get_parallel_id) \ macro(ompd_get_parallel_function) */\ macro(ompd_get_thread_handle) \ /*macro(ompd_get_osthread)*/ \ +macro(ompd_enumerate_states) \ macro(ompd_get_state) \ /*macro(ompd_get_max_threads) \ macro(ompd_get_thread_num) \ diff --git a/libompd/src/CMakeLists.txt b/libompd/src/CMakeLists.txt 
index 53a7e3e2c..5a80c026d 100644 --- a/libompd/src/CMakeLists.txt +++ b/libompd/src/CMakeLists.txt @@ -1,7 +1,7 @@ project (libompd) message( "LIBOMP_INCLUDE_DIR = ${LIBOMP_INCLUDE_DIR}" ) -add_library (ompd SHARED TargetValue.cpp omp-debug.cpp) +add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp) add_dependencies(ompd omp) # ensure generated import library is created first diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 5f773b871..e5dbc3901 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -12,18 +12,19 @@ */ #define NDEBUG 1 +w #include "omp-debug.h" #include "omp.h" -// #include +#include "ompd-private.h" #include "TargetValue.h" #include #include #include #include #include -#include +const ompd_callbacks_t *callbacks = nullptr; ompd_device_type_sizes_t type_sizes; uint64_t ompd_state; @@ -221,10 +222,10 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, int *cmp_value) { if (!thread_handle_1) return ompd_rc_stale_handle; - if (!thread_handle_2) - return ompd_rc_stale_handle; - *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; - return ompd_rc_ok; + if (!thread_handle_2) + return ompd_rc_stale_handle; + *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; + return ompd_rc_ok; } /* --- 4.2 Parallel Region Handles------------------------------------------- */ @@ -240,38 +241,60 @@ ompd_rc_t ompd_get_current_parallel_handle( if (!thread_handle->ah) return ompd_rc_stale_handle; ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) + ompd_thread_context_t *thread_context = thread_handle->thread_context; + if (!context || !thread_context) return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; - TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_team") /*__kmp_threads[t]->th.th_team*/ - .cast("kmp_team_p", 1) - .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + ompd_rc_t ret; - ompd_rc_t ret = teamdata.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (thread_handle->ah->kind == ompd_thread_id_cudalogical) { + ompd_address_t taddr; + TValue ph = TValue(context, thread_context, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + ret = ph.getAddress(&taddr) + if (ret != ompd_rc_ok) + return ret; - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; - ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + } else { + ompd_address_t taddr, lwt; - (*parallel_handle)->ah = thread_handle->ah; - (*parallel_handle)->th = taddr; - (*parallel_handle)->lwt = lwt; + TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_team") /*__kmp_threads[t]->th.th_team*/ + .cast("kmp_team_p", 1) + .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = 
teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->lwt = lwt; + } return ompd_rc_ok; } @@ -612,136 +635,6 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, return ompd_rc_ok; } - -/* --- 5 Process and Thread Settings ---------------------------------------- */ - -ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; - - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - int nth; - ret = TValue(context, "__kmp_avail_proc") - .castBase("__kmp_avail_proc") - .getValue(nth); - *val = nth; - return ret; -} - -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; - - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - int nth; - ret = - TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); - *val = nth; - return ret; -} - -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ - -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = ompd_rc_ok; - if (parallel_handle->lwt.address != 0) - *val = 1; - else { - uint32_t res; - ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_nproc") /*t.t_nproc*/ - .castBase() - .getValue(res); - *val = res; - } - return ret; -} - -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - uint32_t res; - - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_level") /*t.t_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} - -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - 
return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - uint32_t res; - - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_active_level") /*t.t_active_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} - /* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ ompd_rc_t ompd_get_parallel_data( @@ -829,44 +722,44 @@ ompd_get_thread_handle(ompd_address_space_handle_t .castBase(ompd_type_short) .getValue(tId); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) + return ret; - if (tId != p->threadIdx.x) - return ompd_rc_stale_handle; + if (tId != p->threadIdx.x) + return ompd_rc_stale_handle; - ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), - (void **)(thread_handle)); - if (ret != ompd_rc_ok) - return ret; + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; - (*thread_handle)->ah = addr_handle; - (*thread_handle)->th = taddr; -} else { - ret = TValue(context, tcontext, "__kmp_gtid") - .castBase("__kmp_gtid") - .getValue(tId); - if (ret != ompd_rc_ok) - return ret; + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; + } else { + ret = TValue(context, tcontext, "__kmp_gtid") + .castBase("__kmp_gtid") + .getValue(tId); + if (ret != ompd_rc_ok) + return ret; - if (tId < 0) // thread is no omp worker - return ompd_rc_unavailable; + if (tId < 0) // thread is no omp worker + return ompd_rc_unavailable; - TValue th = TValue(context, "__kmp_threads") // __kmp_threads - .cast("kmp_info_t", 2) - .getArrayElement(tId) /*__kmp_threads[t]*/ - .access("th"); /*__kmp_threads[t]->th*/ + TValue th = TValue(context, "__kmp_threads") // __kmp_threads + .cast("kmp_info_t", 2) + .getArrayElement(tId) /*__kmp_threads[t]*/ + .access("th"); /*__kmp_threads[t]->th*/ - ompd_address_t taddr; - ret = th.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; - ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), - (void **)(thread_handle)); - if (ret != ompd_rc_ok) - return ret; - (*thread_handle)->ah = addr_handle; - (*thread_handle)->th = taddr; + ompd_address_t taddr; + ret = th.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; #ifndef NDEBUG if (ret != ompd_rc_ok) @@ -887,47 +780,17 @@ ompd_get_thread_handle(ompd_address_space_handle_t "Callback table not initialized!"); #endif } + (*thread_handle)->thread_context = tcontext; return ret; } ompd_rc_t ompd_get_thread_id( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { - if (kind != ompd_thread_id_pthread) + if (kind != ompd_thread_id_pthread && kind != ompd_thread_id_cudalogical) return ompd_rc_bad_input; - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - ompd_size_t size; - ompd_rc_t ret = tf.getType(context, "kmp_thread_t").getSize(&size); - if (ret != ompd_rc_ok) - return ret; - if (sizeof_thread_id != size) + if 
(thread_handle->ah->kind != kind) return ompd_rc_bad_input; - - assert(callbacks && "Callback table not initialized!"); - - ret = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_info") /*__kmp_threads[t]->th.th_info*/ - .cast("kmp_desc_t") - .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ - .cast("kmp_desc_base_t") - .access("ds_thread") /*__kmp_threads[t]->th.th_info.ds.ds_thread*/ - .cast("kmp_thread_t") - .getRawValue(thread_id, 1); - return ret; -} - -ompd_rc_t ompd_get_thread_num( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ) { - // __kmp_threads[8]->th.th_info.ds.ds_tid if (!thread_handle) return ompd_rc_stale_handle; if (!thread_handle->ah) @@ -935,19 +798,30 @@ ompd_rc_t ompd_get_thread_num( ompd_address_space_context_t *context = thread_handle->ah->context; if (!context) return ompd_rc_stale_handle; + ompd_rc_t ret; - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_info") /*__kmp_threads[t]->th.th_info*/ - .cast("kmp_desc_t") - .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ - .cast("kmp_desc_base_t") - .access("ds_tid") /*__kmp_threads[t]->th.th_info.ds.ds_tid*/ - .castBase() - .getValue(*val); + if (kind != ompd_thread_id_cudalogical) { + ret = ompd_rc_unsupported; + } else { + ompd_size_t size; + ret = tf.getType(context, "kmp_thread_t").getSize(&size); + if (ret != ompd_rc_ok) + return ret; + if (sizeof_thread_id != size) + return ompd_rc_bad_input; + + assert(callbacks && "Callback table not initialized!"); + + ret = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_info") /*__kmp_threads[t]->th.th_info*/ + .cast("kmp_desc_t") + .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ + .cast("kmp_desc_base_t") + .access("ds_thread") /*__kmp_threads[t]->th.th_info.ds.ds_thread*/ + .cast("kmp_thread_t") + .getRawValue(thread_id, 1); + } return ret; } @@ -1009,244 +883,6 @@ ompd_rc_t ompd_get_state( /* --- 8.1 Task Settings ---------------------------------------------------- */ -ompd_rc_t ompd_get_max_threads( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("nproc") // td->td_icvs.dynamic - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t ompd_in_parallel( // Why do we need a task context for _in_parallel? 
- ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - ompd_rc_t ret; - - assert(callbacks && "Callback table not initialized!"); - - ret = TValue(context, "__kmp_root") // __kmp_root - .cast("kmp_root_t", 2) - .dereference() // (*__kmp_root) - .access("r") // (*__kmp_root)->r - .cast("kmp_base_root_t") - .access("r_in_parallel") // (*__kmp_root)->r.r_in_parallel - .castBase() - .getValue(*val); - if (ret != ompd_rc_ok) - return ret; - if (*val) - *val = 1; - - return ret; -} - -ompd_rc_t -ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_flags") // td->td_icvs - .cast("kmp_tasking_flags_t") - .check("final", val); // td->td_icvs.max_active_levels - - return ret; -} - -ompd_rc_t -ompd_get_dynamic(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("dynamic") // td->td_icvs.dynamic - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t -ompd_get_nested(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("nested") // td->td_icvs.nested - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t ompd_get_max_active_levels( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("max_active_levels") // td->td_icvs.max_active_levels - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t 
-ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - TValue sched = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("sched") // td->td_icvs.sched - .cast("kmp_r_sched_t", 0); - - ompd_rc_t ret = sched - .access("r_sched_type") // td->td_icvs.sched.r_sched_type - .castBase() - .getValue(*kind); - if (ret != ompd_rc_ok) - return ret; - ret = sched - .access("chunk") // td->td_icvs.sched.r_sched_type - .castBase() - .getValue(*modifier); - return ret; -} - -ompd_rc_t -ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *bind /* OUT: Kind of proc-binding */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("proc_bind") // td->td_icvs.proc_bind - .castBase() - .getValue(*bind); - - return ret; -} - -ompd_rc_t -ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_flags") // td->td_flags - .cast("kmp_tasking_flags_t") - .check("tasktype", val); // td->td_flags.tasktype - *val ^= 1; // tasktype: explicit = 1, implicit = 0 => invert the value - return ret; -} - /* --- 8.2 OMPT Task Inquiry Analogues -------------------------------------- */ ompd_rc_t ompd_get_task_frame( diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index c76321b75..cdaeb2f44 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -32,7 +32,6 @@ extern "C" { #define STR(x) STR_HELPER(x) #include "ompd.h" -#include "ompd-private.h" /****************************************************************************** * General helper functions @@ -42,7 +41,7 @@ ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); #ifdef __cplusplus } -static const ompd_callbacks_t *callbacks = NULL; +extern const ompd_callbacks_t *callbacks; class ompdAllocatable { public: @@ -91,6 +90,7 @@ typedef struct _ompd_device_handle_s : public ompdAllocatable { typedef struct _ompd_thread_handle_s : public ompdAllocatable { ompd_address_space_handle_t *ah; + ompd_thread_context_t *thread_context; ompd_address_t th; /* target handle */ } ompd_thread_handle_t; diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp new file mode 100644 index 000000000..e25974f06 --- /dev/null +++ 
b/libompd/src/omp-state.cpp @@ -0,0 +1,93 @@ +#include "ompd.h" +#include "ompd-private.h" +#include "omp-debug.h" +#include + +const char *get_ompd_state_name(ompd_word_t state) { + switch (state) { +#define ompd_state_macro(state, code) \ + case code: return #state ; + FOREACH_OMP_STATE(ompd_state_macro) +#undef ompd_state_macro + default: return NULL; + } +} + +const char *get_ompd_cuda_state_name(ompd_word_t state) { + switch (state) { + case omp_state_work_serial: + return "omp_state_work_serial"; + case omp_state_work_parallel: + return "omp_state_work_parallel"; + case omp_state_work_reduction: + return "omp_state_work_reduction"; + default: + return NULL; + } +} + +ompd_rc_t ompd_enumerate_states( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, ompd_word_t *next_state, + const char **next_state_name, ompd_word_t *more_enums) { + ompd_rc_t ret; + if (address_space_handle->kind == ompd_device_kind_cuda) { + // We only support a small number of states for cuda devices + *more_enums = 1; + switch (current_state) { + case omp_state_undefined: + *next_state = omp_state_work_serial; + break; + case omp_state_work_serial: + *next_state = omp_state_work_parallel; + break; + case omp_state_work_parallel: + *next_state = omp_state_work_reduction; + *more_enums = 0; + break; + default: + return ompd_rc_bad_input; + } + const char *find_next_state_name = get_ompd_cuda_state_name(*next_state); + char *next_state_name_cpy; + ret = callbacks->memory_alloc( + strlen(find_next_state_name) + 1, (void **)&next_state_name_cpy); + if (ret != ompd_rc_ok) { + return ret; + } + strcpy(next_state_name_cpy, get_ompd_cuda_state_name(*next_state)); + *next_state_name = next_state_name_cpy; + } else { + if (current_state > omp_state_undefined && + current_state >= OMPD_LAST_OMP_STATE) { + return ompd_rc_bad_input; + } + if (current_state == omp_state_undefined) { + (*next_state) = omp_state_work_serial; + (*next_state_name) = get_ompd_state_name(omp_state_work_serial); + (*more_enums) = 1; + return ompd_rc_ok; + } + const char *find_next_state_name; + *next_state = current_state + 1; + while (!(find_next_state_name = get_ompd_state_name(*next_state))) { + ++(*next_state); + } + + char *next_state_name_cpy; + ret = callbacks->memory_alloc(strlen(find_next_state_name) + 1, (void **)&next_state_name_cpy); + if (ret != ompd_rc_ok) { + return ret; + } + strcpy(next_state_name_cpy, find_next_state_name); + + *next_state_name = next_state_name_cpy; + + if (*next_state == OMPD_LAST_OMP_STATE) { + *more_enums = 0; + } else { + *more_enums = 1; + } + } + return ompd_rc_ok; +} diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h index 96824c7cb..c3cd3ee72 100644 --- a/libompd/src/ompd-private.h +++ b/libompd/src/ompd-private.h @@ -52,6 +52,8 @@ typedef enum omp_state_t { #undef ompd_state_macro } omp_state_t; +#define OMPD_LAST_OMP_STATE omp_state_overhead + /** * Primitive types. */ diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index b5d971d0c..7909b1695 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -359,7 +359,7 @@ ompd_rc_t ompd_finalize(void); * and/or destroying threads during or after the call, rendering useless the * vector of handles returned. */ - + /** * Retrieve handles for OpenMP threads in a parallel region. @@ -605,6 +605,14 @@ ompd_rc_t ompd_get_thread_num( * only difference between the OMPD and OMPT counterparts is that the OMPD * version must supply a thread handle to provide a context for this inquiry. 
*/ +ompd_rc_t ompd_enumerate_states ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + ompd_rc_t ompd_get_state( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_word_t *state, /* OUT: State of this thread */ From 8a25e599a310bf4cafcdac6ff2e502b153eb5969 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 13 Jul 2018 08:22:28 -0700 Subject: [PATCH 12/64] [OMPD] Remove unsupported api functions + fixes --- libompd/src/omp-debug.cpp | 9 +++------ libompd/src/omp-state.cpp | 4 ++-- libompd/src/ompd-private.h | 25 +++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index e5dbc3901..0e5819b65 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -12,7 +12,6 @@ */ #define NDEBUG 1 -w #include "omp-debug.h" #include "omp.h" @@ -249,12 +248,12 @@ ompd_rc_t ompd_get_current_parallel_handle( ompd_rc_t ret; - if (thread_handle->ah->kind == ompd_thread_id_cudalogical) { + if (thread_handle->ah->kind == ompd_device_kind_cuda) { ompd_address_t taddr; TValue ph = TValue(context, thread_context, "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) - ret = ph.getAddress(&taddr) + OMPD_SEGMENT_CUDA_PTX_SHARED); + ret = ph.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; @@ -789,8 +788,6 @@ ompd_rc_t ompd_get_thread_id( ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { if (kind != ompd_thread_id_pthread && kind != ompd_thread_id_cudalogical) return ompd_rc_bad_input; - if (thread_handle->ah->kind != kind) - return ompd_rc_bad_input; if (!thread_handle) return ompd_rc_stale_handle; if (!thread_handle->ah) diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp index e25974f06..dd7a3e8e9 100644 --- a/libompd/src/omp-state.cpp +++ b/libompd/src/omp-state.cpp @@ -3,7 +3,7 @@ #include "omp-debug.h" #include -const char *get_ompd_state_name(ompd_word_t state) { +static const char *get_ompd_state_name(ompd_word_t state) { switch (state) { #define ompd_state_macro(state, code) \ case code: return #state ; @@ -13,7 +13,7 @@ const char *get_ompd_state_name(ompd_word_t state) { } } -const char *get_ompd_cuda_state_name(ompd_word_t state) { +static const char *get_ompd_cuda_state_name(ompd_word_t state) { switch (state) { case omp_state_work_serial: return "omp_state_work_serial"; diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h index c3cd3ee72..52e3eb3bb 100644 --- a/libompd/src/ompd-private.h +++ b/libompd/src/ompd-private.h @@ -54,6 +54,31 @@ typedef enum omp_state_t { #define OMPD_LAST_OMP_STATE omp_state_overhead + +#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) +#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) +/** + * The following definitions match with ptx information stored in DWARF + */ +#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) +#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) +#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) +#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) +#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) +#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) +#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) +#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) +#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) +#define 
OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) +#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) +#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) +#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) +#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) +#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) +#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) + /** * Primitive types. */ From 485b1327622b7a67709ca709f9830e8b8b104420 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 13 Jul 2018 08:23:31 -0700 Subject: [PATCH 13/64] [OMPD] add support for some ICVs to libompd --- libompd/src/CMakeLists.txt | 2 +- libompd/src/omp-icv.cpp | 246 +++++++++++++++++++++++++++++++++++++ libompd/src/ompd.h | 193 +++++------------------------ 3 files changed, 277 insertions(+), 164 deletions(-) create mode 100644 libompd/src/omp-icv.cpp diff --git a/libompd/src/CMakeLists.txt b/libompd/src/CMakeLists.txt index 5a80c026d..60a01769e 100644 --- a/libompd/src/CMakeLists.txt +++ b/libompd/src/CMakeLists.txt @@ -1,7 +1,7 @@ project (libompd) message( "LIBOMP_INCLUDE_DIR = ${LIBOMP_INCLUDE_DIR}" ) -add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp) +add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp omp-icv.cpp) add_dependencies(ompd omp) # ensure generated import library is created first diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp new file mode 100644 index 000000000..10e67ec31 --- /dev/null +++ b/libompd/src/omp-icv.cpp @@ -0,0 +1,246 @@ +#include "omp-debug.h" +#include "ompd-private.h" +#include "TargetValue.h" + +#define FOREACH_OMPD_ICV(macro) \ + macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space) \ + macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space) \ + macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread) \ + macro (final_var, "ompd-final-var", ompd_scope_task) \ + macro (implicit_var, "ompd-implicit-var", ompd_scope_task) \ + macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel) \ + + +enum ompd_icv { + ompd_icv_undefined_marker = 0, // ompd_icv_undefined is already defined in ompd.h +#define ompd_icv_macro(v, n, s) ompd_icv_ ## v, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro + ompd_icv_after_last_icv +}; + +static const char *ompd_icv_string_values[] = { + "undefined", +#define ompd_icv_macro(v, n, s) n, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro +}; + +static const ompd_scope_t ompd_icv_scope_values[] = { + ompd_scope_global, // undefined marker +#define ompd_icv_macro(v, n, s) s, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro +}; + +ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, + ompd_icv_id_t current, ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more) { + if (!handle) { + return ompd_rc_stale_handle; + } + if (handle->kind == ompd_device_kind_cuda) { + return ompd_rc_unsupported; + } + if (current + 2 >= ompd_icv_after_last_icv) { + return ompd_rc_bad_input; + } + + *next_id = current + 1; + *next_icv_name = ompd_icv_string_values[*next_id]; + *next_scope = ompd_icv_scope_values[*next_id]; + + if ((*next_id) + 1 >= ompd_icv_after_last_icv) { + *more = 0; + } else { + *more = 1; + } + + return ompd_rc_ok; +} + + +static ompd_rc_t +ompd_get_num_procs(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: number of processes */ + ) { + ompd_address_space_context_t *context = addr_handle->context; + if 
(!context) + return ompd_rc_stale_handle; + ompd_rc_t ret; + + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + int nth; + ret = TValue(context, "__kmp_avail_proc") + .castBase("__kmp_avail_proc") + .getValue(nth); + *val = nth; + return ret; +} + +static ompd_rc_t +ompd_get_thread_limit(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; + + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + int nth; + ret = + TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); + *val = nth; + return ret; +} + +static ompd_rc_t ompd_get_thread_num( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *val /* OUT: number of the thread within the team */ + ) { + // __kmp_threads[8]->th.th_info.ds.ds_tid + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_info") /*__kmp_threads[t]->th.th_info*/ + .cast("kmp_desc_t") + .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ + .cast("kmp_desc_base_t") + .access("ds_tid") /*__kmp_threads[t]->th.th_info.ds.ds_tid*/ + .castBase() + .getValue(*val); + return ret; +} + +static ompd_rc_t +ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_icvs + .cast("kmp_tasking_flags_t") + .check("final", val); // td->td_icvs.max_active_levels + + return ret; +} + +static ompd_rc_t +ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_flags + .cast("kmp_tasking_flags_t") + .check("tasktype", val); // td->td_flags.tasktype + *val ^= 1; // tasktype: explicit = 1, implicit = 0 => invert the value + return ret; +} + +static ompd_rc_t +ompd_get_num_threads(ompd_parallel_handle_t + *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: number of threads */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = ompd_rc_ok; + if 
(parallel_handle->lwt.address != 0) { + *val = 1; + } else { + uint32_t res; + ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_nproc") /*t.t_nproc*/ + .castBase() + .getValue(res); + *val = res; + } + return ret; +} + +ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + ompd_word_t *icv_value) { + if (!handle) { + return ompd_rc_stale_handle; + } + if (icv_id >= ompd_icv_after_last_icv || icv_id == 0) { + return ompd_rc_bad_input; + } + if (scope != ompd_icv_scope_values[icv_id]) { + return ompd_rc_bad_input; + } + + switch (icv_id) { + case ompd_icv_thread_limit_var: + return ompd_get_thread_limit((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_num_procs_var: + return ompd_get_num_procs((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_thread_num_var: + return ompd_get_thread_num((ompd_thread_handle_t*)handle, icv_value); + case ompd_icv_final_var: + return ompd_in_final((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_implicit_var: + return ompd_is_implicit((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads((ompd_parallel_handle_t*)handle, icv_value); + default: + return ompd_rc_unsupported; + } +} + +ompd_rc_t +ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + const char **icv_string) { + return ompd_rc_unsupported; +} diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 7909b1695..57531d111 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -56,32 +56,6 @@ typedef struct ompd_address_t { ompd_addr_t address; /* target address in the segment */ } ompd_address_t; -#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) -#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) - -/** - * The following definitions match with ptx information stored in DWARF - */ -#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) -#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) -#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) -#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) -#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) -#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) -#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) -#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) -#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) -#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) -#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) -#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) -#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) -#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) -#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) -#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) - - typedef uint64_t ompd_device_identifier_t; typedef enum ompd_device_kind_t { @@ -132,6 +106,21 @@ typedef enum ompd_thread_id_kind_t { ompd_thread_id_cudalogical = 3 } ompd_thread_id_kind_t; +/** + * Scope for ICVs + */ +typedef enum ompd_scope_t { + ompd_scope_global = 1, + ompd_scope_address_space = 2, + ompd_scope_thread = 3, + ompd_scope_parallel = 4, + ompd_scope_implicit_task = 5, + ompd_scope_task = 6 +} ompd_scope_t; + +typedef uint64_t ompd_icv_id_t; +const uint64_t ompd_icv_undefined = 0; + /** * Return codes. * Each OMPD operation returns a code. 
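For orientation between the hunks of this ompd.h diff: the ompd_scope_t and ompd_icv_id_t types added above pair with the ompd_enumerate_icvs / ompd_get_icv_from_scope entry points declared later in this patch, and together they replace the per-setting getters that the next hunk deletes. Below is a minimal debugger-side sketch of the intended usage, not taken from the patch itself; the helper name read_thread_limit and the already-initialized addrhandle are assumptions for illustration only.

    /* Sketch: enumerate the ICV ids once, then query "thread-limit-var"
     * (scope ompd_scope_address_space) against the same address-space handle. */
    #include <string.h>
    #include "ompd.h"

    static ompd_rc_t read_thread_limit(ompd_address_space_handle_t *addrhandle,
                                       ompd_word_t *thread_limit) {
      ompd_icv_id_t id = ompd_icv_undefined;
      const char *name;
      ompd_scope_t scope;
      int more = 1;
      while (more) {
        /* walk the enumeration until the runtime reports no further ICVs */
        if (ompd_enumerate_icvs(addrhandle, id, &id, &name, &scope, &more) !=
            ompd_rc_ok)
          return ompd_rc_error;
        if (scope == ompd_scope_address_space &&
            strcmp(name, "thread-limit-var") == 0)
          /* the scope reported for the id must match the handle kind passed in */
          return ompd_get_icv_from_scope((void *)addrhandle,
                                         ompd_scope_address_space, id,
                                         thread_limit);
      }
      return ompd_rc_bad_input; /* ICV not exposed by this runtime */
    }

The same pattern generalizes to the other scopes: cache the (id, scope) pair per ICV name once per address space, then pass a handle whose kind matches the reported scope, as the gdb-wrapper's OMPDIcvs class does in a later patch of this series.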
@@ -492,70 +481,6 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_task_handle_t *task_handle_2, int *cmp_value); -/* --- 5o Process and Thread Settings ---------------------------------------- - */ - -/** - * The functions ompd_get_num_procs and ompd_get_thread_limit are third-party - * versions of the OpenMP runtime functions omp_get_num_procs and - * omp_get_thread_limit. - */ - -ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ); - -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ); - -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ - -/** - * Determine the number of threads associated with a parallel region. - */ -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ); - -/** - * Determine the nesting depth of a particular parallel region instance. - */ -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ); - -/** - * Determine the number of enclosing active parallel regions. - * - * ompd_get_active_level returns the number of nested, active parallel regions - * enclosing the parallel region specified by its handle. - */ -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ); - -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - -/** - * The functions ompd_get_parallel_id and ompd_get_parallel_function are - * third-party variants of their OMPT counterparts. The only difference between - * the OMPD and OMPT versions is that the OMPD must supply a parallel region - * handle to provide a context for these inquiries. - */ -ompd_rc_t ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ); - /* --- 7 Thread Inquiry ----------------------------------------------------- */ /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ @@ -584,16 +509,6 @@ ompd_rc_t ompd_get_thread_id( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id); -ompd_rc_t ompd_get_thread_data( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_address_t *data /* OUT: OpenMP thread data */ - ); - -ompd_rc_t ompd_get_thread_num( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ); - /* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ /** @@ -620,64 +535,6 @@ ompd_rc_t ompd_get_state( ); /* --- 8 Task Inquiry ------------------------------------------------------- */ - - -/* --- 8.2 Task Settings ---------------------------------------------------- */ - -/** - * Retrieve information from OpenMP tasks. These inquiry functions have no - * counterparts in the OMPT interface as a first-party tool can call OpenMP - * runtime inquiry functions directly. 
The only difference between the OMPD - * inquiry operations and their counterparts in the OpenMP runtime is that the - * OMPD version must supply a task handle to provide a context for each inquiry. - */ - -ompd_rc_t ompd_get_max_threads( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ); - -ompd_rc_t -ompd_in_parallel(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in parallel? */ - ); - -ompd_rc_t -ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in final? */ - ); - -ompd_rc_t -ompd_get_dynamic(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: ? */ - ); - -ompd_rc_t -ompd_get_nested(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: Is this task nested? */ - ); - -ompd_rc_t ompd_get_max_active_levels( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: max active levels */ - ); - -ompd_rc_t -ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ); - -ompd_rc_t -ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *bind /* OUT: Kind of proc-binding */ - ); - -ompd_rc_t -ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: implicit=1, explicit=0 */ - ); - /* --- 8.3 OMPT Task Inquiry Analogues -------------------------------------- */ /** @@ -710,11 +567,6 @@ ompd_rc_t ompd_get_task_frame( ompd_address_t *sp_reentry /* OUT: previous frame is user code */ ); -ompd_rc_t -ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_address_t *task_data /* OUT: OpenMP task ID */ - ); - /* --- 13 Display Control Variables ----------------------------------------- */ /** @@ -739,6 +591,21 @@ ompd_rc_t ompd_release_display_control_vars( const char *const **control_var_values /* IN */ ); +/* --- Internal Control Variables ------------------------------------------- */ + +ompd_rc_t +ompd_enumerate_icvs(ompd_address_space_handle_t *handle, ompd_icv_id_t current, + ompd_icv_id_t *next_id, const char **next_icv_name, + ompd_scope_t *next_scope, int *more); + +ompd_rc_t +ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, ompd_icv_id_t icv_id, + ompd_word_t *icv_value); + +ompd_rc_t +ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, const char **icv_string); + #ifdef __cplusplus } #endif From f37acc32d57969d15d4887654f88d4381e9103da Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 13 Jul 2018 08:27:19 -0700 Subject: [PATCH 14/64] [OMPD] Add serial state in cuda device rtl --- libomptarget/deviceRTLs/nvptx/src/parallel.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index b6ef81b27..b992e8929 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -412,6 +412,9 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_serial); +#endif 
/*OMPD_SUPPORT*/ } EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, @@ -426,6 +429,9 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, threadId, currTaskDescr->GetPrevTaskDescr()); // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) { From 4c40f17771a4c132d00e30b325a9bd4eb00e6d1a Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 13 Jul 2018 16:14:37 -0700 Subject: [PATCH 15/64] [OMPD] Add some icvs + move values in ompd_types.h --- libompd/gdb-wrapper/OMPDCommand.cpp | 125 ++++++++++++++++++++-- libompd/gdb-wrapper/OMPDCommand.h | 38 +++++++ libompd/gdb-wrapper/ompd_typedefs.h | 15 +++ libompd/src/omp-debug.cpp | 18 ++-- libompd/src/omp-debug.h | 4 +- libompd/src/omp-icv.cpp | 159 ++++++++++++++++++++++++++-- libompd/src/omp-state.cpp | 2 +- libompd/src/ompd-private.h | 21 +--- libompd/src/ompd.h | 42 +------- libompd/src/ompd_types.h | 45 ++++++++ 10 files changed, 385 insertions(+), 84 deletions(-) create mode 100644 libompd/src/ompd_types.h diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 13c2c6c97..2e364779c 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -25,19 +25,55 @@ using namespace ompd_gdb; using namespace std; -const char * ompd_state_names[512]; extern OMPDHostContextPool * host_contextPool; +/* --- OMPDIcvs ------------------------------------------------------------- */ + +OMPDIcvs::OMPDIcvs(OMPDFunctionsPtr functions, + ompd_address_space_handle_t *addrhandle) + : functions(functions) { + ompd_icv_id_t next_icv_id = ompd_icv_undefined; + int more = 1; + const char *next_icv_name_str; + ompd_scope_t next_scope; + ompd_rc_t ret = ompd_rc_ok; + while (more && ret == ompd_rc_ok) { + ret = functions->ompd_enumerate_icvs(addrhandle, + next_icv_id, + &next_icv_id, + &next_icv_name_str, + &next_scope, + &more); + if (ret == ompd_rc_ok) { + availableIcvs[next_icv_name_str] = + std::pair(next_icv_id, next_scope); + } + } +} + + +ompd_rc_t OMPDIcvs::get(ompd_parallel_handle_t *handle, const char *name, + ompd_word_t *value) { + ompd_icv_id_t icv; + ompd_scope_t scope; + + auto &p = availableIcvs.at(name); + icv = p.first; + scope = p.second; + + if (scope != ompd_scope_parallel) { + return ompd_rc_bad_input; + } + + return functions->ompd_get_icv_from_scope((void *)handle, scope, icv, value); +} + /* --- OMPDCommandFactory --------------------------------------------------- */ OMPDCommandFactory::OMPDCommandFactory() { functions = OMPDFunctionsPtr(new OMPDFunctions); -#define ompd_state_macro(state, code) ompd_state_names[code] = #state; - FOREACH_OMP_STATE(ompd_state_macro) -#undef ompd_state_macro - // Load OMPD DLL and get a handle #ifdef ODB_LINUX functions->ompdLibHandle = dlopen("libompd.so", RTLD_LAZY); @@ -85,6 +121,8 @@ FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) { out << "ERROR: could not initialize target process\n"; } + + icvs = OMPDIcvsPtr(new OMPDIcvs(functions, addrhandle)); } OMPDCommandFactory::~OMPDCommandFactory() @@ -128,7 +166,8 @@ OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& e return new OMPDApi(functions, addrhandle, extraArgs); else if (strcmp(str, "testapi") == 0) return new OMPDTest(functions, addrhandle, extraArgs); - + else if (strcmp(str, "parallel") == 0) + return new OMPDParallelRegions(functions, addrhandle, icvs, 
extraArgs); return new OMPDNull; } @@ -688,3 +727,77 @@ const char* OMPDTest::toString() const { return "odb api"; } + +void OMPDParallelRegions::execute() const +{ + ompd_rc_t ret; + vector host_thread_handles; + // get all thread handles + auto thread_ids = getThreadIDsFromDebugger(); + for(auto i: thread_ids) { + ompd_thread_handle_t* thread_handle; + ret = functions->ompd_get_thread_handle( + addrhandle, ompd_thread_id_pthread, sizeof(i.second), + &(i.second), &thread_handle); + if (ret == ompd_rc_ok) + { + host_thread_handles.push_back(thread_handle); + } + } + + // get parallel handles for thread handles + ParallelMap host_parallel_handles; + for (auto t: host_thread_handles) { + ompd_parallel_handle_t *parallel_handle; + ret = functions->ompd_get_current_parallel_handle(t, ¶llel_handle); + if (ret != ompd_rc_ok) { + continue; + } + ompd_parallel_handle_t *key = parallel_handle_in_map(parallel_handle, + host_parallel_handles); + if (key) { + host_parallel_handles[key].push_back(t); + functions->ompd_release_parallel_handle(parallel_handle); + } else { + host_parallel_handles[parallel_handle].push_back(t); + } + } + + printf("HOST PARALLEL REGIONS\n"); + printf("Parallel Handle Num Threads ICV Num Threads ICV level ICV active level\n"); + printf("------------------------------------------------------------------------------\n"); + for (auto &p: host_parallel_handles) { + ompd_word_t icv_num_threads, icv_level, icv_active_level; + icvs->get(p.first, "ompd-team-size-var", &icv_num_threads); + icvs->get(p.first, "levels-var", &icv_level); + icvs->get(p.first, "active-levels-var", &icv_active_level); + printf("%-15p %-10zu %-15llu %-9llu %llu\n", p.first, p.second.size(), icv_num_threads, icv_level, icv_active_level); + } + + for (auto t: host_thread_handles) { + functions->ompd_release_thread_handle(t); + } + for (auto &p: host_parallel_handles) { + functions->ompd_release_parallel_handle(p.first); + } +} + +const char *OMPDParallelRegions::toString() const +{ + return "odb parallel"; +} + + +ompd_parallel_handle_t *OMPDParallelRegions::parallel_handle_in_map(ompd_parallel_handle_t *handle, + std::map> parallel_handles) const +{ + for (ParallelMap::const_iterator iter = parallel_handles.cbegin(); + iter != parallel_handles.cend(); iter++) { + int cmp; + functions->ompd_parallel_handle_compare(iter->first, handle, &cmp); + if (!cmp) { + return iter->first; + } + } + return NULL; +} diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index 3ccc4e805..ac1da08ce 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "ompd.h" #include "ompd_typedefs.h" //#include "ompd_test.h" @@ -100,6 +101,8 @@ macro(ompd_get_proc_bind)*/ \ macro(ompd_get_task_frame) \ /*macro(ompd_get_task_id) */\ macro(ompd_get_api_version) \ +macro(ompd_enumerate_icvs) \ +macro(ompd_get_icv_from_scope) \ /*macro(ompd_get_version_string) \*/ @@ -136,6 +139,20 @@ FOREACH_OMPD_API_FN(OMPD_API_FUNCTION_POINTER_MEMBER) typedef std::shared_ptr OMPDFunctionsPtr; +class OMPDIcvs +{ +private: + OMPDFunctionsPtr functions; + std::map> availableIcvs; +public: + OMPDIcvs(OMPDFunctionsPtr functions, + ompd_address_space_handle_t *addrhandle); + ompd_rc_t get(ompd_parallel_handle_t *handle, const char *name, + ompd_word_t *value); +}; + +typedef std::shared_ptr OMPDIcvsPtr; + class OMPDCommand; class OMPDCommandFactory @@ -143,6 +160,7 @@ class OMPDCommandFactory private: void * findFunctionInLibrary(const 
char *fun) const; OMPDFunctionsPtr functions = nullptr; + OMPDIcvsPtr icvs = nullptr; // ompd_process_handle_t* prochandle = nullptr; ompd_address_space_handle_t* addrhandle = nullptr; OutputString out; @@ -280,6 +298,26 @@ class OMPDTest : public OMPDCommand friend OMPDCommandFactory; }; +class OMPDParallelRegions : public OMPDCommand +{ + typedef std::map> ParallelMap; +public: + ~OMPDParallelRegions() {}; + void execute() const; + const char *toString() const; +protected: + OMPDParallelRegions(const OMPDFunctionsPtr &f, + ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, + const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; + + friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; + ompd_parallel_handle_t *parallel_handle_in_map(ompd_parallel_handle_t *handle, + std::map>) const; +}; + } #endif /* GDB_OMPDCOMMAND_H_ */ diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h index 39fe07a8d..28fc39d57 100644 --- a/libompd/gdb-wrapper/ompd_typedefs.h +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -185,3 +185,18 @@ typedef ompd_rc_t (*ompd_get_state_fn_t) ( ompd_wait_id_t *wait_id /* OUT: Wait ID */ ); +typedef ompd_rc_t (*ompd_enumerate_icvs_fn_t) ( + ompd_address_space_handle_t *handle, + ompd_icv_id_t current, + ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more + ); + +typedef ompd_rc_t (*ompd_get_icv_from_scope_fn_t) ( + void *handle, + ompd_scope_t scope, + ompd_icv_id_t icv_id, + ompd_word_t *icv_value + ); diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 0e5819b65..c48d5a354 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -60,7 +60,7 @@ ompd_process_initialize(ompd_address_space_context_t if (!addrhandle) return ompd_rc_error; (*addrhandle)->context = context; - (*addrhandle)->kind = ompd_device_kind_host; + (*addrhandle)->kind = OMP_DEVICE_KIND_HOST; return ompd_rc_ok; } @@ -149,7 +149,7 @@ ompd_rc_t ompd_device_initialize( if (!device_handle) return ompd_rc_error; (*device_handle)->context = device_context; - (*device_handle)->kind = ompd_device_kind_cuda; + (*device_handle)->kind = OMP_DEVICE_KIND_CUDA; (*device_handle)->id = (uint64_t)id; return ompd_rc_ok; } @@ -248,7 +248,7 @@ ompd_rc_t ompd_get_current_parallel_handle( ompd_rc_t ret; - if (thread_handle->ah->kind == ompd_device_kind_cuda) { + if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { ompd_address_t taddr; TValue ph = TValue(context, thread_context, "omptarget_nvptx_threadPrivateContext", @@ -677,7 +677,7 @@ ompd_rc_t ompd_get_parallel_data( ompd_rc_t ompd_get_thread_handle(ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_handle_t **thread_handle) { if (!addr_handle) @@ -697,7 +697,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t int tId; - if (kind == ompd_thread_id_cudalogical) { + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->items.threadId @@ -785,8 +785,8 @@ ompd_get_thread_handle(ompd_address_space_handle_t ompd_rc_t ompd_get_thread_id( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { - if (kind != ompd_thread_id_pthread && kind != ompd_thread_id_cudalogical) + 
ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { + if (kind != OMPD_THREAD_ID_PTHREAD && kind != OMPD_THREAD_ID_CUDALOGICAL) return ompd_rc_bad_input; if (!thread_handle) return ompd_rc_stale_handle; @@ -797,7 +797,7 @@ ompd_rc_t ompd_get_thread_id( return ompd_rc_stale_handle; ompd_rc_t ret; - if (kind != ompd_thread_id_cudalogical) { + if (kind != OMPD_THREAD_ID_CUDALOGICAL) { ret = ompd_rc_unsupported; } else { ompd_size_t size; @@ -844,7 +844,7 @@ ompd_rc_t ompd_get_state( ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); - if (thread_handle->ah->kind == ompd_device_kind_cuda) { + if (thread_handle->ah->kind == OMP_DEVICE_KIND_HOST) { if (wait_id) *wait_id = 0; //TODO: (mr) implement wait_ids in nvptx device rtl ret = TValue(context, thread_handle->th) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index cdaeb2f44..355fe09a5 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -79,8 +79,8 @@ typedef struct _ompd_process_handle_s : public ompdAllocatable { typedef struct _ompd_address_space_handle_s : public ompdAllocatable { ompd_address_space_context_t *context; - ompd_device_kind_t kind; - ompd_device_identifier_t id; + omp_device_t kind; + uint64_t id; } ompd_address_space_handle_t; typedef struct _ompd_device_handle_s : public ompdAllocatable { diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp index 10e67ec31..13d5de20f 100644 --- a/libompd/src/omp-icv.cpp +++ b/libompd/src/omp-icv.cpp @@ -2,13 +2,17 @@ #include "ompd-private.h" #include "TargetValue.h" -#define FOREACH_OMPD_ICV(macro) \ - macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space) \ - macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space) \ - macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread) \ - macro (final_var, "ompd-final-var", ompd_scope_task) \ - macro (implicit_var, "ompd-implicit-var", ompd_scope_task) \ - macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel) \ +#define FOREACH_OMPD_ICV(macro) \ + macro (levels_var, "levels-var", ompd_scope_parallel) \ + macro (active_levels_var, "active-levels-var", ompd_scope_parallel) \ + macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space) \ + macro (max_active_levels_var, "max-active-levels-var", ompd_scope_task) \ + macro (bind_var, "bind-var", ompd_scope_task) \ + macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space) \ + macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread) \ + macro (final_var, "ompd-final-var", ompd_scope_task) \ + macro (implicit_var, "ompd-implicit-var", ompd_scope_task) \ + macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel) \ enum ompd_icv { @@ -41,10 +45,10 @@ ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, if (!handle) { return ompd_rc_stale_handle; } - if (handle->kind == ompd_device_kind_cuda) { + if (handle->kind == OMP_DEVICE_KIND_CUDA) { return ompd_rc_unsupported; } - if (current + 2 >= ompd_icv_after_last_icv) { + if (current + 1 >= ompd_icv_after_last_icv) { return ompd_rc_bad_input; } @@ -62,6 +66,53 @@ ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, } +static ompd_rc_t ompd_get_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: nesting level */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + 
assert(callbacks && "Callback table not initialized!"); + + uint32_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_level") /*t.t_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} + +static ompd_rc_t ompd_get_active_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: active nesting level */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + uint32_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_active_level") /*t.t_active_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} + + static ompd_rc_t ompd_get_num_procs(ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ @@ -156,6 +207,88 @@ ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ return ret; } +static ompd_rc_t +ompd_get_max_active_levels( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("max_active_levels") // td->td_icvs.max_active_levels + .castBase() + .getValue(*val); + + return ret; +} + +static ompd_rc_t +ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ + ompd_word_t *modifier /* OUT: Schedunling modifier */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + TValue sched = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("sched") // td->td_icvs.sched + .cast("kmp_r_sched_t", 0); + + ompd_rc_t ret = sched + .access("r_sched_type") // td->td_icvs.sched.r_sched_type + .castBase() + .getValue(*kind); + if (ret != ompd_rc_ok) + return ret; + ret = sched + .access("chunk") // td->td_icvs.sched.r_sched_type + .castBase() + .getValue(*modifier); + return ret; +} + +static ompd_rc_t +ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *bind /* OUT: Kind of proc-binding */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("proc_bind") // td->td_icvs.proc_bind + .castBase() + .getValue(*bind); + + return ret; +} + + static ompd_rc_t ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ ompd_word_t *val /* OUT: max number of threads 
*/ @@ -221,8 +354,16 @@ ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, } switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_active_levels_var: + return ompd_get_active_level((ompd_parallel_handle_t *)handle, icv_value); case ompd_icv_thread_limit_var: return ompd_get_thread_limit((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_max_active_levels_var: + return ompd_get_max_active_levels((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_bind_var: + return ompd_get_proc_bind((ompd_task_handle_t*)handle, icv_value); case ompd_icv_num_procs_var: return ompd_get_num_procs((ompd_address_space_handle_t*)handle, icv_value); case ompd_icv_thread_num_var: diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp index dd7a3e8e9..82b31fa48 100644 --- a/libompd/src/omp-state.cpp +++ b/libompd/src/omp-state.cpp @@ -31,7 +31,7 @@ ompd_rc_t ompd_enumerate_states( ompd_word_t current_state, ompd_word_t *next_state, const char **next_state_name, ompd_word_t *more_enums) { ompd_rc_t ret; - if (address_space_handle->kind == ompd_device_kind_cuda) { + if (address_space_handle->kind == OMP_DEVICE_KIND_CUDA) { // We only support a small number of states for cuda devices *more_enums = 1; switch (current_state) { diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h index 52e3eb3bb..6283aa33a 100644 --- a/libompd/src/ompd-private.h +++ b/libompd/src/ompd-private.h @@ -93,25 +93,6 @@ typedef enum ompd_target_prim_types_t { ompd_type_max } ompd_target_prim_types_t; -/** - * Logical coordinates of OMP target device threads - */ -typedef struct ompd_dim3_t { - ompd_word_t x; - ompd_word_t y; - ompd_word_t z; -} ompd_dim3_t; +#include "ompd_types.h" -typedef struct ompd_cudathread_coord_t { - ompd_addr_t cudaDevId; - ompd_addr_t cudaContext; - ompd_addr_t warpSize; - ompd_addr_t gridId; - ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work - // with grids too well. - ompd_dim3_t gridDim; - ompd_dim3_t blockDim; - ompd_dim3_t blockIdx; - ompd_dim3_t threadIdx; -} ompd_cudathread_coord_t; #endif /*SRC_OMPD_PRIVATE_H*/ diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 57531d111..36588c0f6 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -56,12 +56,9 @@ typedef struct ompd_address_t { ompd_addr_t address; /* target address in the segment */ } ompd_address_t; -typedef uint64_t ompd_device_identifier_t; +typedef uint64_t omp_device_t; -typedef enum ompd_device_kind_t { - ompd_device_kind_host = 1, - ompd_device_kind_cuda = 2 -} ompd_device_kind_t; +typedef uint64_t ompd_thread_id_t; /** * Context handle. @@ -90,22 +87,6 @@ typedef struct _ompd_parallel_handle_s ompd_parallel_handle_t; typedef struct _ompd_task_handle_s ompd_task_handle_t; typedef struct _ompd_address_space_handle_s ompd_address_space_handle_t; -/** - * Other handles. - */ -#define OMPD_THREAD_ID_PTHREAD 0 -#define OMPD_THREAD_ID_LWP 1 -#define OMPD_THREAD_ID_WINTHREAD 2 -#define OMPD_THREAD_ID_CUDALOGICAL 3 -#define OMPD_THREAD_ID_MAX 4 - -typedef enum ompd_thread_id_kind_t { - ompd_thread_id_pthread = 0, - ompd_thread_id_lwp = 1, - ompd_thread_id_winthread = 2, - ompd_thread_id_cudalogical = 3 -} ompd_thread_id_kind_t; - /** * Scope for ICVs */ @@ -181,7 +162,7 @@ typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)( * Get thread specific context. 
*/ typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)( - ompd_address_space_context_t *context, ompd_thread_id_kind_t kind, + ompd_address_space_context_t *context, ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_context_t **thread_context); @@ -337,19 +318,6 @@ ompd_rc_t ompd_finalize(void); /* --- 4.1 Thread Handles --------------------------------------------------- */ -/** - * Retrieve handles for all OpenMP threads. - * - * The ompd_get_threads operation enables the debugger to obtain handles for all - * OpenMP threads. A successful invocation of ompd_get_threads returns a pointer - * to a vector of handles in thread_handle_array and returns the number of - * handles in num_handles. This call yields meaningful results only if all - * OpenMP threads are stopped; otherwise, the OpenMP runtime may be creating - * and/or destroying threads during or after the call, rendering useless the - * vector of handles returned. - */ - - /** * Retrieve handles for OpenMP threads in a parallel region. * @@ -495,7 +463,7 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_rc_t ompd_get_thread_handle( ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ ); @@ -507,7 +475,7 @@ ompd_rc_t ompd_get_thread_handle( */ ompd_rc_t ompd_get_thread_id( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id); + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id); /* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ diff --git a/libompd/src/ompd_types.h b/libompd/src/ompd_types.h new file mode 100644 index 000000000..ff66e79c5 --- /dev/null +++ b/libompd/src/ompd_types.h @@ -0,0 +1,45 @@ +#ifndef OMPD_TYPES_H_ +#define OMPD_TYPES_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// Values for omp_device_kind +#define OMP_DEVICE_KIND_HOST 1 +#define OMP_DEVICE_KIND_CUDA 2 + +// Values for ompd_thread_id_t +#define OMPD_THREAD_ID_PTHREAD 0 +#define OMPD_THREAD_ID_LWP 1 +#define OMPD_THREAD_ID_WINTHREAD 2 +#define OMPD_THREAD_ID_CUDALOGICAL 3 +#define OMPD_THREAD_ID_MAX 4 + +/** + * Logical coordinates of OMP target device threads + */ +typedef struct ompd_dim3_t { + ompd_word_t x; + ompd_word_t y; + ompd_word_t z; +} ompd_dim3_t; + +typedef struct ompd_cudathread_coord_t { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work + // with grids too well. 
+ ompd_dim3_t gridDim; + ompd_dim3_t blockDim; + ompd_dim3_t blockIdx; + ompd_dim3_t threadIdx; +} ompd_cudathread_coord_t; + +#ifdef __cplusplus +} +#endif + +#endif /*OMPD_TYPES_H_*/ From 528b3ede81cc042fb4128ad992544b0d6162c16b Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 16 Jul 2018 09:19:15 -0700 Subject: [PATCH 16/64] [OMPD] Align ompd.h formatting to spec --- libompd/src/omp-debug.cpp | 6 +- libompd/src/ompd.h | 161 +++++++++++++++++++------------------- 2 files changed, 84 insertions(+), 83 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index c48d5a354..fe5c6eec2 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -66,9 +66,9 @@ ompd_process_initialize(ompd_address_space_context_t } ompd_rc_t -ompd_get_openmp_version(ompd_address_space_handle_t +ompd_get_omp_version(ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_word_t *version) { + ompd_word_t *version) { if (!addr_handle) return ompd_rc_stale_handle; ompd_address_space_context_t *context = addr_handle->context; @@ -85,7 +85,7 @@ ompd_get_openmp_version(ompd_address_space_handle_t return ret; } -ompd_rc_t ompd_get_openmp_version_string( +ompd_rc_t ompd_get_omp_version_string( ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ const char **string) { diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 36588c0f6..0a585ac11 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -56,8 +56,8 @@ typedef struct ompd_address_t { ompd_addr_t address; /* target address in the segment */ } ompd_address_t; +/* types for device and thread id KIND, not for the actual thread/device id */ typedef uint64_t omp_device_t; - typedef uint64_t ompd_thread_id_t; /** @@ -255,12 +255,12 @@ typedef struct ompd_callbacks_t { * Call signatures from the debugger to the OMPD DLL. */ -/* --- 4 Initialization ----------------------------------------------------- */ +/* --- 4.1 Initialization --------------------------------------------------- */ /** * The OMPD function ompd_get_version_string returns a descriptive string * describing an implementation of the OMPD library. The function - * ompd_get_version_compatibility returns an integer code used to indicate the + * ompd_get_api_version returns an integer code used to indicate the * revision of the OMPD specification supported by an implementation of OMPD. 
*/ @@ -281,6 +281,10 @@ ompd_initialize(ompd_word_t version, const ompd_callbacks_t *table /* IN: callbacks table */ ); +ompd_rc_t ompd_finalize(void); + +/* --- 4.2 Per Process Initialization and Finalization ---------------------- */ + ompd_rc_t ompd_process_initialize(ompd_address_space_context_t *context, /* IN: debugger handle for the target */ @@ -288,21 +292,6 @@ ompd_process_initialize(ompd_address_space_context_t *addrhandle /* OUT: ompd handle for the target */ ); -ompd_rc_t -ompd_get_openmp_version(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *version); - -ompd_rc_t ompd_get_openmp_version_string( - ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - const char **string); - -ompd_rc_t ompd_release_address_space_handle( - ompd_address_space_handle_t - *addr_handle /* IN: handle for the address space */ - ); - ompd_rc_t ompd_device_initialize( ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ ompd_address_space_context_t *device_context, @@ -312,11 +301,24 @@ ompd_rc_t ompd_device_initialize( ompd_address_space_handle_t **device_handle ); -ompd_rc_t ompd_finalize(void); +ompd_rc_t ompd_release_address_space_handle( + ompd_address_space_handle_t + *addr_handle /* IN: handle for the address space */ + ); + +/* --- 4.4 Address Space Information ---------------------------------------- */ + +ompd_rc_t +ompd_get_omp_version(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *version); -/* --- 4 Handle Management -------------------------------------------------- */ +ompd_rc_t ompd_get_omp_version_string( + ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + const char **string); -/* --- 4.1 Thread Handles --------------------------------------------------- */ +/* --- 4.5 Thread Handles --------------------------------------------------- */ /** * Retrieve handles for OpenMP threads in a parallel region. @@ -336,13 +338,38 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_thread_handle_t **thread_handle /* OUT: handle */ ); +/** + * Obtain an OpenMP thread handle and the internal OS thread handle for the + * selected (context) thread. + * If the function returns ompd_rc_ok then the operating system thread + * corresponds to an OpenMP thread and the thread_handle is initialized. The + * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime + * system. + */ +ompd_rc_t ompd_get_thread_handle( + ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ + ); + ompd_rc_t ompd_release_thread_handle(ompd_thread_handle_t *thread_handle); ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, ompd_thread_handle_t *thread_handle_2, int *cmp_value); -/* --- 4.2 Parallel Region Handles------------------------------------------- */ +/** + * Obtain the OS thread handle for an OpenMP thread handle. + * this might change over time in case virtual openmp threads migrate between + * OS threads. 
+ */ +ompd_rc_t ompd_get_thread_id( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id); + +/* --- 4.6 Parallel Region Handles------------------------------------------- */ /** * Retrieve the handle for the innermost patallel region for an OpenMP thread. @@ -395,7 +422,7 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, ompd_parallel_handle_t *parallel_handle_2, int *cmp_value); -/* --- 4.3 Task Handles ----------------------------------------------------- */ +/* --- 4.7 Task Handles ----------------------------------------------------- */ /** * Retrieve the handle for the innermost task for an OpenMP thread. @@ -449,61 +476,11 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_task_handle_t *task_handle_2, int *cmp_value); -/* --- 7 Thread Inquiry ----------------------------------------------------- */ -/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - -/** - * Obtain an OpenMP thread handle and the internal OS thread handle for the - * selected (context) thread. - * If the function returns ompd_rc_ok then the operating system thread - * corresponds to an OpenMP thread and the thread_handle is initialized. The - * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime - * system. - */ -ompd_rc_t ompd_get_thread_handle( - ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ - ); - -/** - * Obtain the OS thread handle for an OpenMP thread handle. - * this might change over time in case virtual openmp threads migrate between - * OS threads. - */ -ompd_rc_t ompd_get_thread_id( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id); - -/* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ - -/** - * Get the state of a thread. This can use OMPT state data structure to define - * different states of threads (e.g., idle, working, or barrier, etc) and what - * entity cased this state (e.g., address of a lock); - * - * The function ompd_get_state is a third-party version of ompt_get_state. The - * only difference between the OMPD and OMPT counterparts is that the OMPD - * version must supply a thread handle to provide a context for this inquiry. 
- */ -ompd_rc_t ompd_enumerate_states ( - ompd_address_space_handle_t *address_space_handle, - ompd_word_t current_state, - ompd_word_t *next_state, - const char **next_state_name, - ompd_word_t *more_enums - ); - -ompd_rc_t ompd_get_state( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *state, /* OUT: State of this thread */ - ompd_wait_id_t *wait_id /* OUT: Wait ID */ - ); - -/* --- 8 Task Inquiry ------------------------------------------------------- */ -/* --- 8.3 OMPT Task Inquiry Analogues -------------------------------------- */ +/* +ompd_rc_t ompd_get_task_function( + ompd_task_handle_t *task_handle, + ompd_address_t *entry_point); +*/ /** * The functions defined here are third-party versions of ompt_get_task_frame @@ -535,7 +512,31 @@ ompd_rc_t ompd_get_task_frame( ompd_address_t *sp_reentry /* OUT: previous frame is user code */ ); -/* --- 13 Display Control Variables ----------------------------------------- */ + +/** + * Get the state of a thread. This can use OMPT state data structure to define + * different states of threads (e.g., idle, working, or barrier, etc) and what + * entity cased this state (e.g., address of a lock); + * + * The function ompd_get_state is a third-party version of ompt_get_state. The + * only difference between the OMPD and OMPT counterparts is that the OMPD + * version must supply a thread handle to provide a context for this inquiry. + */ +ompd_rc_t ompd_enumerate_states ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + +ompd_rc_t ompd_get_state( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + +/* --- 4.8 Display Control Variables ---------------------------------------- */ /** * Using the ompd_display_control_vars function, the debugger can extract a @@ -559,7 +560,7 @@ ompd_rc_t ompd_release_display_control_vars( const char *const **control_var_values /* IN */ ); -/* --- Internal Control Variables ------------------------------------------- */ +/* --- 4.9 Internal Control Variables --------------------------------------- */ ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, ompd_icv_id_t current, From c612600247d8227908133528cfc37ed16712f3c2 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 16 Jul 2018 10:41:18 -0700 Subject: [PATCH 17/64] [OMPD] changed some comments for current spec --- libompd/src/omp-debug.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index fe5c6eec2..90d567789 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -25,11 +25,10 @@ const ompd_callbacks_t *callbacks = nullptr; ompd_device_type_sizes_t type_sizes; -uint64_t ompd_state; /* --- OMPD functions ------------------------------------------------------- */ -/* --- 3 Initialization ----------------------------------------------------- */ +/* --- 1 Initialization ----------------------------------------------------- */ ompd_rc_t ompd_initialize(ompd_word_t version, const ompd_callbacks_t *table) { ompd_rc_t ret = table ? 
ompd_rc_ok : ompd_rc_bad_input; @@ -158,10 +157,7 @@ ompd_rc_t ompd_device_initialize( return ompd_rc_unavailable; } - -/* --- 4 Handle Management -------------------------------------------------- */ - -/* --- 4.1 Thread Handles --------------------------------------------------- */ +/* --- 4.5 Thread Handles --------------------------------------------------- */ /* thread_handle is of type (kmp_base_info_t) */ @@ -227,7 +223,7 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, return ompd_rc_ok; } -/* --- 4.2 Parallel Region Handles------------------------------------------- */ +/* --- 4.6 Parallel Region Handles------------------------------------------- */ /* parallel_handle is of type (kmp_base_team_t)*/ @@ -422,7 +418,7 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, return ompd_rc_ok; } -/* --- 4.3 Task Handles ----------------------------------------------------- */ +/* --- 4.7 Task Handles ----------------------------------------------------- */ /* task_handle is of type (kmp_taskdata_t) */ @@ -1011,7 +1007,7 @@ ompd_rc_t ompd_get_task_function( } #endif -/* --- 9 OMPD Version and Compatibility Information ------------------------- */ +/* --- --- OMPD Version and Compatibility Information ----------------------- */ ompd_rc_t ompd_get_api_version(ompd_word_t *version) { *version = OMPD_VERSION; @@ -1029,7 +1025,7 @@ ompd_get_api_version_string(const char **string /* OUT: OMPD version string */ return ompd_rc_ok; } -/* --- 12 Display Control Variables ----------------------------------------- */ +/* --- 4.8 Display Control Variables ---------------------------------------- */ ompd_rc_t ompd_get_display_control_vars(ompd_address_space_handle_t *handle, From 5dd42cc0256600016c40fa441e06cd43dda51913 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 17 Jul 2018 08:36:53 -0700 Subject: [PATCH 18/64] [OMPD] Some cleanup in gdb-wrapper --- libompd/gdb-wrapper/OMPDCommand.cpp | 70 +++++++++++------------------ libompd/gdb-wrapper/OMPDCommand.h | 36 ++------------- 2 files changed, 30 insertions(+), 76 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 2e364779c..1e45491f9 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -159,7 +159,7 @@ OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& e else if (strcmp(str, "threads") == 0) return new OMPDThreads(functions, addrhandle, extraArgs); else if (strcmp(str, "levels") == 0) - return new OMPDLevels(functions, addrhandle, extraArgs); + return new OMPDLevels(functions, addrhandle, icvs, extraArgs); else if (strcmp(str, "callback") == 0) return new OMPDCallback(functions, addrhandle, extraArgs); else if (strcmp(str, "api") == 0) @@ -365,30 +365,28 @@ const char* OMPDThreads::toString() const void OMPDLevels::execute() const { -/* ompd_size_t num_os_threads; - ompd_rc_t ret = CB_num_os_threads(contextPool->getGlobalOmpdContext(), &num_os_threads); - assert(ret==ompd_rc_ok && "Error calling OMPD!"); - ompd_osthread_t* osThreads = (ompd_osthread_t*) - malloc(sizeof(ompd_osthread_t)*num_os_threads); - ret = CB_get_os_threads (contextPool->getGlobalOmpdContext(), &num_os_threads, &osThreads); - assert(ret==ompd_rc_ok && "Error calling OMPD!"); - + ompd_rc_t ret; printf("\n"); printf("Thread_handle Nesting_level\n"); printf("-------------------------------\n"); - for (size_t i=0; i < num_os_threads; ++i) + for (auto i: getThreadIDsFromDebugger()) { - ompd_thread_handle_t 
thread_handle; + ompd_thread_handle_t *thread_handle; + ompd_parallel_handle_t *parallel_handle; ret = functions->ompd_get_thread_handle( - contextPool->getGlobalOmpdContext(), &(osThreads[i]), &thread_handle); + addrhandle, ompd_thread_id_pthread, sizeof(i.second) ,&(i.second), &thread_handle); + if (ret != ompd_rc_ok) { + continue; + } + ret = functions->ompd_get_current_parallel_handle(thread_handle, + ¶llel_handle); if (ret == ompd_rc_ok) { - ompd_tword_t level=0; - ret = functions->ompd_nesting_level( - contextPool->getGlobalOmpdContext(), &thread_handle, &level); - printf("%-12u %ld\n", (unsigned int)thread_handle, level); + ompd_word_t level=0; + icvs->get(parallel_handle, "levels-var", &level); + printf("%-12p %ld\n", thread_handle, level); } - }*/ + } } const char* OMPDLevels::toString() const @@ -731,35 +729,21 @@ const char* OMPDTest::toString() const void OMPDParallelRegions::execute() const { ompd_rc_t ret; - vector host_thread_handles; - // get all thread handles - auto thread_ids = getThreadIDsFromDebugger(); - for(auto i: thread_ids) { - ompd_thread_handle_t* thread_handle; - ret = functions->ompd_get_thread_handle( - addrhandle, ompd_thread_id_pthread, sizeof(i.second), - &(i.second), &thread_handle); - if (ret == ompd_rc_ok) - { - host_thread_handles.push_back(thread_handle); - } - } + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); // get parallel handles for thread handles ParallelMap host_parallel_handles; for (auto t: host_thread_handles) { - ompd_parallel_handle_t *parallel_handle; - ret = functions->ompd_get_current_parallel_handle(t, ¶llel_handle); - if (ret != ompd_rc_ok) { - continue; - } - ompd_parallel_handle_t *key = parallel_handle_in_map(parallel_handle, - host_parallel_handles); - if (key) { - host_parallel_handles[key].push_back(t); - functions->ompd_release_parallel_handle(parallel_handle); - } else { - host_parallel_handles[parallel_handle].push_back(t); + for (auto parallel_handle: odbGetParallelRegions(functions, t)) + { + ompd_parallel_handle_t *key = parallel_handle_in_map( + parallel_handle, host_parallel_handles); + if (key) { + host_parallel_handles[key].push_back(t); + functions->ompd_release_parallel_handle(parallel_handle); + } else { + host_parallel_handles[parallel_handle].push_back(t); + } } } @@ -771,7 +755,7 @@ void OMPDParallelRegions::execute() const icvs->get(p.first, "ompd-team-size-var", &icv_num_threads); icvs->get(p.first, "levels-var", &icv_level); icvs->get(p.first, "active-levels-var", &icv_active_level); - printf("%-15p %-10zu %-15llu %-9llu %llu\n", p.first, p.second.size(), icv_num_threads, icv_level, icv_active_level); + printf("%-15p %-10zu %-15ld %-9ld %ld\n", p.first, p.second.size(), icv_num_threads, icv_level, icv_active_level); } for (auto t: host_thread_handles) { diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index ac1da08ce..3423fe3b2 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -58,52 +58,26 @@ macro(ompd_device_initialize) \ macro(ompd_release_address_space_handle) \ macro(ompd_initialize) \ macro(ompd_finalize) \ -/*macro(ompd_get_threads) */\ macro(ompd_get_thread_in_parallel) \ macro(ompd_release_thread_handle) \ macro(ompd_thread_handle_compare) \ macro(ompd_get_thread_id) \ -/*macro(ompd_get_top_parallel_region)*/ \ macro(ompd_get_current_parallel_handle) \ macro(ompd_get_enclosing_parallel_handle) \ -/*macro(ompd_get_task_enclosing_parallel_handle) */\ macro(ompd_release_parallel_handle) \ 
macro(ompd_parallel_handle_compare) \ -/*macro(ompd_get_top_task_region) \ -macro(ompd_get_ancestor_task_region) \ -macro(ompd_get_implicit_task_in_parallel) */\ macro(ompd_get_current_task_handle) \ macro(ompd_get_generating_task_handle) \ -/*macro(ompd_get_scheduling_task_handle)*/ \ macro(ompd_get_task_in_parallel) \ macro(ompd_release_task_handle) \ macro(ompd_task_handle_compare) \ -/*macro(ompd_get_num_procs) \ -macro(ompd_get_thread_limit) \ -macro(ompd_get_num_threads) \ -macro(ompd_get_level) \ -macro(ompd_get_active_level) \ -macro(ompd_get_parallel_id) \ -macro(ompd_get_parallel_function) */\ macro(ompd_get_thread_handle) \ -/*macro(ompd_get_osthread)*/ \ macro(ompd_enumerate_states) \ macro(ompd_get_state) \ -/*macro(ompd_get_max_threads) \ -macro(ompd_get_thread_num) \ -macro(ompd_in_parallel) \ -macro(ompd_in_final) \ -macro(ompd_get_dynamic) \ -macro(ompd_get_nested) \ -macro(ompd_get_max_active_levels) \ -macro(ompd_get_schedule) \ -macro(ompd_get_proc_bind)*/ \ macro(ompd_get_task_frame) \ -/*macro(ompd_get_task_id) */\ macro(ompd_get_api_version) \ macro(ompd_enumerate_icvs) \ macro(ompd_get_icv_from_scope) \ -/*macro(ompd_get_version_string) \*/ namespace ompd_gdb { @@ -129,12 +103,6 @@ typedef struct FOREACH_OMPD_API_FN(OMPD_API_FUNCTION_POINTER_MEMBER) #undef OMPD_API_FUNCTION_POINTER_MEMBER -/* ompd_rc_t (*ompd_initialize) (ompd_callbacks_t *) = nullptr; - ompd_get_thread_handle_fn_t ompd_get_thread_handle = nullptr; - ompd_nesting_level_fn_t ompd_nesting_level = nullptr; - ompd_read_tmemory_fn_t ompd_read_tmemory = nullptr; -*/ - } OMPDFunctions; typedef std::shared_ptr OMPDFunctionsPtr; @@ -252,12 +220,14 @@ class OMPDThreads : public OMPDCommand class OMPDLevels : public OMPDCommand { + OMPDIcvsPtr icvs; public: ~OMPDLevels(){}; void execute() const; const char* toString() const; protected: - OMPDLevels(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const std::vector& args) : OMPDCommand(f, ah, args){}; + OMPDLevels(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const OMPDIcvsPtr &icvs, const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; friend OMPDCommandFactory; }; From 3473deb931bdf064a3ed8e08ff31df5277821eae Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 17 Jul 2018 09:05:44 -0700 Subject: [PATCH 19/64] [OMPD] fix omp version --- libompd/src/omp-debug.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index 355fe09a5..18e693a00 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -20,10 +20,10 @@ extern "C" { #endif -#define OMPD_IMPLEMENTS_OPENMP 3 -#define OMPD_IMPLEMENTS_OPENMP_SUBVERSION 1 +#define OMPD_IMPLEMENTS_OPENMP 5 +#define OMPD_IMPLEMENTS_OPENMP_SUBVERSION 0 #define OMPD_TR_VERSION 6 -#define OMPD_TR_SUBVERSION 'j' +#define OMPD_TR_SUBVERSION 2 #define OMPD_VERSION \ (OMPD_IMPLEMENTS_OPENMP << 24) + (OMPD_IMPLEMENTS_OPENMP_SUBVERSION << 16) + \ (OMPD_TR_VERSION << 8) + OMPD_TR_SUBVERSION From 64b73b0f20160391ce005726066e5356c35c6b24 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 17 Jul 2018 15:43:42 -0700 Subject: [PATCH 20/64] [OMPD] Add type compatibillity for new spec to odb --- libompd/gdb-wrapper/Callbacks.cpp | 4 ++-- libompd/gdb-wrapper/Callbacks.h | 2 +- libompd/gdb-wrapper/ompd_typedefs.h | 8 +++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/libompd/gdb-wrapper/Callbacks.cpp b/libompd/gdb-wrapper/Callbacks.cpp index ad739c3e5..77c91ec1b 100644 --- 
a/libompd/gdb-wrapper/Callbacks.cpp +++ b/libompd/gdb-wrapper/Callbacks.cpp @@ -77,14 +77,14 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( ompd_address_space_context_t *context, - ompd_thread_id_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext ) { ompd_rc_t ret = context ? ompd_rc_ok : ompd_rc_stale_handle; - if (kind == ompd_thread_id_cudalogical) { + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { *tcontext = ((OMPDContext*)context)->getContextForThread((CudaThread*)osthread); } else { diff --git a/libompd/gdb-wrapper/Callbacks.h b/libompd/gdb-wrapper/Callbacks.h index 040c7819e..3e8f379be 100644 --- a/libompd/gdb-wrapper/Callbacks.h +++ b/libompd/gdb-wrapper/Callbacks.h @@ -48,7 +48,7 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( ompd_address_space_context_t *context, - ompd_thread_id_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext); diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h index 28fc39d57..837b943f8 100644 --- a/libompd/gdb-wrapper/ompd_typedefs.h +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -1,13 +1,15 @@ #include "ompd.h" -/* this should be somewhere else*/ -typedef uint64_t omp_device_t; -typedef ompd_thread_id_kind_t ompd_thread_id_t; /* 4.3.4.1 * Global initialization and finalization */ +// TODO: (mr) I dont have time to change every thread id kind, so this is some compat stuff +#define ompd_thread_id_pthread OMPD_THREAD_ID_PTHREAD +#define ompd_thread_id_cudalogical OMPD_THREAD_ID_CUDALOGICAL +#define ompd_device_kind_cuda OMP_DEVICE_KIND_CUDA + typedef ompd_rc_t (*ompd_initialize_fn_t) ( ompd_word_t api_version, const ompd_callbacks_t *callbacks From 43d1e58008fcb08a3458154cf698aabdcce459c5 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 17 Jul 2018 15:48:12 -0700 Subject: [PATCH 21/64] [OMPD] Code clean-up in odb --- libompd/gdb-wrapper/OMPDCommand.cpp | 34 +++++++++-------------------- libompd/gdb-wrapper/OMPDCommand.h | 5 +---- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 1e45491f9..3643f81c2 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -732,18 +732,19 @@ void OMPDParallelRegions::execute() const auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); // get parallel handles for thread handles - ParallelMap host_parallel_handles; + //ParallelMap host_parallel_handles; + auto cmp_fn = [this](const ompd_parallel_handle_t *a, const ompd_parallel_handle_t *b){ + int cmp = 0; + this->functions->ompd_parallel_handle_compare((ompd_parallel_handle_t*)a, (ompd_parallel_handle_t*)b, &cmp); + return cmp < 0; + }; + std::map, + decltype(cmp_fn)> host_parallel_handles(cmp_fn); for (auto t: host_thread_handles) { for (auto parallel_handle: odbGetParallelRegions(functions, t)) { - ompd_parallel_handle_t *key = parallel_handle_in_map( - parallel_handle, host_parallel_handles); - if (key) { - host_parallel_handles[key].push_back(t); - functions->ompd_release_parallel_handle(parallel_handle); - } else { - host_parallel_handles[parallel_handle].push_back(t); - } + host_parallel_handles[parallel_handle].push_back(t); } } @@ -770,18 +771,3 @@ const char *OMPDParallelRegions::toString() const { return "odb parallel"; } - - -ompd_parallel_handle_t *OMPDParallelRegions::parallel_handle_in_map(ompd_parallel_handle_t 
*handle, - std::map> parallel_handles) const -{ - for (ParallelMap::const_iterator iter = parallel_handles.cbegin(); - iter != parallel_handles.cend(); iter++) { - int cmp; - functions->ompd_parallel_handle_compare(iter->first, handle, &cmp); - if (!cmp) { - return iter->first; - } - } - return NULL; -} diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index 3423fe3b2..b8414bdef 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -270,7 +270,6 @@ class OMPDTest : public OMPDCommand class OMPDParallelRegions : public OMPDCommand { - typedef std::map> ParallelMap; public: ~OMPDParallelRegions() {}; void execute() const; @@ -279,13 +278,11 @@ class OMPDParallelRegions : public OMPDCommand OMPDParallelRegions(const OMPDFunctionsPtr &f, ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, const std::vector& args) - : OMPDCommand(f, ah, args), icvs(icvs) {}; + : OMPDCommand(f, ah, args), icvs(icvs) {} friend OMPDCommandFactory; private: OMPDIcvsPtr icvs; - ompd_parallel_handle_t *parallel_handle_in_map(ompd_parallel_handle_t *handle, - std::map>) const; }; } From 66775c1bf481c871ab074eeaf7ee88d8079d6a0b Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 17 Jul 2018 16:17:09 -0700 Subject: [PATCH 22/64] [OMPD] Fix bug introd. by type changes f. new spec --- libompd/src/omp-debug.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 90d567789..d5d841e67 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -840,7 +840,7 @@ ompd_rc_t ompd_get_state( ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); - if (thread_handle->ah->kind == OMP_DEVICE_KIND_HOST) { + if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { if (wait_id) *wait_id = 0; //TODO: (mr) implement wait_ids in nvptx device rtl ret = TValue(context, thread_handle->th) From bf0ea8ab7b5ce14dc2f728abcb11c6be84f356ba Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 18 Jul 2018 13:56:44 -0700 Subject: [PATCH 23/64] [OMPD] Add "tasks" command to odb --- libompd/gdb-wrapper/OMPDCommand.cpp | 61 ++++++++++++++++++++++++----- libompd/gdb-wrapper/OMPDCommand.h | 45 ++++++++++++++++++++- libompd/src/omp-debug.cpp | 2 +- 3 files changed, 97 insertions(+), 11 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 3643f81c2..5a877b6c4 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -168,6 +168,8 @@ OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& e return new OMPDTest(functions, addrhandle, extraArgs); else if (strcmp(str, "parallel") == 0) return new OMPDParallelRegions(functions, addrhandle, icvs, extraArgs); + else if (strcmp(str, "tasks") == 0) + return new OMPDTasks(functions, addrhandle, icvs, extraArgs); return new OMPDNull; } @@ -731,16 +733,10 @@ void OMPDParallelRegions::execute() const ompd_rc_t ret; auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); - // get parallel handles for thread handles - //ParallelMap host_parallel_handles; - auto cmp_fn = [this](const ompd_parallel_handle_t *a, const ompd_parallel_handle_t *b){ - int cmp = 0; - this->functions->ompd_parallel_handle_compare((ompd_parallel_handle_t*)a, (ompd_parallel_handle_t*)b, &cmp); - return cmp < 0; - }; + OMPDParallelHandleCmp parallel_cmp_op(functions); std::map, - decltype(cmp_fn)> host_parallel_handles(cmp_fn); + std::vector, + 
OMPDParallelHandleCmp> host_parallel_handles(parallel_cmp_op); for (auto t: host_thread_handles) { for (auto parallel_handle: odbGetParallelRegions(functions, t)) { @@ -771,3 +767,50 @@ const char *OMPDParallelRegions::toString() const { return "odb parallel"; } + +void OMPDTasks::execute() const +{ + ompd_rc_t ret; + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + OMPDTaskHandleCmp task_cmp_op(functions); + std::map, + OMPDTaskHandleCmp> host_task_handles(task_cmp_op); + for (auto t: host_thread_handles) { + for (auto task_handle: odbGetTaskRegions(functions, t)) { + host_task_handles[task_handle].push_back(t); + } + } + + printf("HOST TASKS\n"); + printf("Task Handle Assoc. Threads ICV Level Enter Frame Exit Frame\n"); + printf("-------------------------------------------------------------------\n"); + for (auto th: host_task_handles) { + ompd_parallel_handle_t *ph; + ret = functions->ompd_get_task_parallel_handle(th.first, &ph); + if (ret != ompd_rc_ok) { + printf("could not get parallel handle for nesting\n"); + continue; + } + + ompd_word_t icv_level; + icvs->get(ph, "levels-var", &icv_level); + ompd_address_t enter_frame; + ompd_address_t exit_frame; + ret = functions->ompd_get_task_frame(th.first, &enter_frame, &exit_frame); + printf("%-11p %-14zu %-9ld %-11p %-10p\n", th.first, th.second.size(), icv_level, (void*)enter_frame.address, (void*)exit_frame.address); + } + + for (auto task: host_task_handles) { + functions->ompd_release_task_handle(task.first); + } + + for (auto thread: host_thread_handles) { + functions->ompd_release_thread_handle(thread); + } +} + +const char *OMPDTasks::toString() const +{ + return "odb tasks"; +} diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index b8414bdef..f1718facd 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -36,7 +36,7 @@ /* * The macro is used to create code to register all implemented ompd - * API functions with the CommandFactory + * API functions with the CommandFactory * For new implemented API function just add a new OMPD_DO line */ @@ -64,6 +64,7 @@ macro(ompd_thread_handle_compare) \ macro(ompd_get_thread_id) \ macro(ompd_get_current_parallel_handle) \ macro(ompd_get_enclosing_parallel_handle) \ +macro(ompd_get_task_parallel_handle) \ macro(ompd_release_parallel_handle) \ macro(ompd_parallel_handle_compare) \ macro(ompd_get_current_task_handle) \ @@ -121,6 +122,32 @@ class OMPDIcvs typedef std::shared_ptr OMPDIcvsPtr; +class OMPDParallelHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDParallelHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_parallel_handle_t *a, ompd_parallel_handle_t *b) { + int cmp = 0; + functions->ompd_parallel_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + +class OMPDTaskHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDTaskHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_task_handle_t *a, ompd_task_handle_t *b) { + int cmp = 0; + functions->ompd_task_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + class OMPDCommand; class OMPDCommandFactory @@ -285,6 +312,22 @@ class OMPDParallelRegions : public OMPDCommand OMPDIcvsPtr icvs; }; +class OMPDTasks : public OMPDCommand +{ +public: + ~OMPDTasks() {} + void execute() const; + const char *toString() const; +protected: + OMPDTasks(const OMPDFunctionsPtr &f, + ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, + const std::vector& args) + : OMPDCommand(f, ah, args), 
icvs(icvs) {} + friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; +}; + } #endif /* GDB_OMPDCOMMAND_H_ */ diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index d5d841e67..4adf6eb94 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -907,7 +907,7 @@ ompd_rc_t ompd_get_task_frame( .access("ompt_task_info") // td->ompt_task_info .cast("ompt_task_info_t") .access("frame") // td->ompd_task_info.frame - .cast("ompt_frame_t", 0); + .cast("omp_frame_t", 0); sp_reentry->segment = OMPD_SEGMENT_UNSPECIFIED; ompd_rc_t ret = frame From 504e7be8387990ddc2523d9032d27c7d53b55f7f Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 19 Jul 2018 10:17:35 -0700 Subject: [PATCH 24/64] [OMPD] Add get_task_function and test code in odb --- libompd/gdb-wrapper/OMPDCommand.cpp | 19 ++++++++++++++++--- libompd/gdb-wrapper/OMPDCommand.h | 1 + libompd/src/TargetValue.cpp | 7 ++++++- libompd/src/omp-debug.cpp | 2 ++ libompd/src/ompd.h | 4 ++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 5a877b6c4..7d31a7da7 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -783,8 +783,8 @@ void OMPDTasks::execute() const } printf("HOST TASKS\n"); - printf("Task Handle Assoc. Threads ICV Level Enter Frame Exit Frame\n"); - printf("-------------------------------------------------------------------\n"); + printf("Task Handle Assoc. Threads ICV Level Enter Frame Exit Frame Task function\n"); + printf("-----------------------------------------------------------------------------------\n"); for (auto th: host_task_handles) { ompd_parallel_handle_t *ph; ret = functions->ompd_get_task_parallel_handle(th.first, &ph); @@ -795,10 +795,23 @@ void OMPDTasks::execute() const ompd_word_t icv_level; icvs->get(ph, "levels-var", &icv_level); + ompd_address_t enter_frame; ompd_address_t exit_frame; ret = functions->ompd_get_task_frame(th.first, &enter_frame, &exit_frame); - printf("%-11p %-14zu %-9ld %-11p %-10p\n", th.first, th.second.size(), icv_level, (void*)enter_frame.address, (void*)exit_frame.address); + if (ret != ompd_rc_ok) { + printf("could not get task frame\n"); + continue; + } + + ompd_address_t task_function; + ret = functions->ompd_get_task_function(th.first, &task_function); + if (ret != ompd_rc_ok) { + printf("could not get task entry point\n"); + } + printf("%-11p %-14zu %-9ld %-11p %-10p %p\n", th.first, + th.second.size(), icv_level, (void*)enter_frame.address, + (void*)exit_frame.address, (void*)task_function.address); } for (auto task: host_task_handles) { diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index f1718facd..f6e3a867d 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -75,6 +75,7 @@ macro(ompd_task_handle_compare) \ macro(ompd_get_thread_handle) \ macro(ompd_enumerate_states) \ macro(ompd_get_state) \ +macro(ompd_get_task_function) \ macro(ompd_get_task_frame) \ macro(ompd_get_api_version) \ macro(ompd_enumerate_icvs) \ diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index e81efe918..ecf348823 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -367,7 +367,12 @@ ompd_rc_t TValue::check(const char *bitfieldName, ompd_word_t *isSet) const { TValue TValue::getArrayElement(int elemNumber) const { if (gotError()) return *this; - TValue ret = dereference(); + TValue ret; + if (pointerLevel > 0) { + ret = 
dereference(); + } else { + ret = *this; + } if (ret.pointerLevel == 0) { ompd_size_t size; ret.errorState.errorCode = type->getSize(&size); diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 08577ee6d..325df0bfa 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -973,8 +973,10 @@ ompd_rc_t ompd_get_task_function( ompd_address_space_context_t *context = task_handle->ah->context; if (!context) return ompd_rc_stale_handle; +#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; +#endif assert(callbacks && "Callback table not initialized!"); diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 0a585ac11..d6ea310c8 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -476,11 +476,11 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_task_handle_t *task_handle_2, int *cmp_value); -/* + ompd_rc_t ompd_get_task_function( ompd_task_handle_t *task_handle, ompd_address_t *entry_point); -*/ + /** * The functions defined here are third-party versions of ompt_get_task_frame From d208dc17c230b0c69a93c5ccf201e028cdc3444f Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 20 Jul 2018 08:19:16 -0700 Subject: [PATCH 25/64] [OMPD] Fix some formatting --- libompd/gdb-wrapper/OMPDCommand.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 7d31a7da7..61b7dbe45 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -795,7 +795,7 @@ void OMPDTasks::execute() const ompd_word_t icv_level; icvs->get(ph, "levels-var", &icv_level); - + ompd_address_t enter_frame; ompd_address_t exit_frame; ret = functions->ompd_get_task_frame(th.first, &enter_frame, &exit_frame); @@ -803,7 +803,7 @@ void OMPDTasks::execute() const printf("could not get task frame\n"); continue; } - + ompd_address_t task_function; ret = functions->ompd_get_task_function(th.first, &task_function); if (ret != ompd_rc_ok) { From 94c30818099be4c3bfc75f110ea8bbab684f02d9 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 20 Jul 2018 08:23:21 -0700 Subject: [PATCH 26/64] [OMPD] Fix formatting mistakes --- .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index d01830872..530ed512a 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -1,31 +1,31 @@ //===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===// // - // The LLVM Compiler Infrastructure - // - // This file is dual licensed under the MIT and the University of Illinois Open - // Source Licenses. See LICENSE.txt for details. - // - //===----------------------------------------------------------------------===// - // - // This file contains the declarations of all library macros, types, - // and functions. - // - //===----------------------------------------------------------------------===// +// The LLVM Compiler Infrastructure +// +// This file is dual licensed under the MIT and the University of Illinois Open +// Source Licenses. See LICENSE.txt for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. 
+// +//===----------------------------------------------------------------------===// #ifndef __OMPTARGET_NVPTX_H #define __OMPTARGET_NVPTX_H - // std includes +// std includes #include #include #include - // cuda includes +// cuda includes #include #include - // local includes +// local includes #include "counter_group.h" #include "debug.h" // debug #include "interface.h" // interfaces with omp, compiler, and user @@ -36,11 +36,11 @@ #define OMPTARGET_NVPTX_VERSION 1.1 - // used by the library for the interface with the app +// used by the library for the interface with the app #define DISPATCH_FINISHED 0 #define DISPATCH_NOTFINISHED 1 - // used by dynamic scheduling +// used by dynamic scheduling #define FINISHED 0 #define NOT_FINISHED 1 #define LAST_CHUNK 2 @@ -48,9 +48,9 @@ #define BARRIER_COUNTER 0 #define ORDERED_COUNTER 1 - // Macros for Cuda intrinsics - // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. - // Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). +// Macros for Cuda intrinsics +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. +// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask(). #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ @@ -65,7 +65,7 @@ #define __ACTIVEMASK() __ballot(1) #endif - // arguments needed for L0 parallelism only. +// arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { #if OMPD_SUPPORT friend void __device__ ompd_init( void ); From 267a23077fe504b6c1dcccecea744e90c3c1234b Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 23 Jul 2018 08:25:27 -0700 Subject: [PATCH 27/64] [OMPD] Fix linking for gdb-wrapper --- libompd/gdb-wrapper/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/libompd/gdb-wrapper/CMakeLists.txt b/libompd/gdb-wrapper/CMakeLists.txt index 11a4f624c..ec87ef31b 100644 --- a/libompd/gdb-wrapper/CMakeLists.txt +++ b/libompd/gdb-wrapper/CMakeLists.txt @@ -34,14 +34,12 @@ add_executable (odb-bin ${cppfiles} odb.cpp) set_target_properties (odb-bin PROPERTIES OUTPUT_NAME odb) add_library (odb ${cppfiles}) -if (ODB_LINUX) -target_link_libraries (odb-bin dl) -target_link_libraries (odb dl) -endif (ODB_LINUX) +target_link_libraries (odb-bin dl) +target_link_libraries (odb dl) include_directories ( ${CMAKE_CURRENT_SOURCE_DIR} -# ${CMAKE_CURRENT_SOURCE_DIR}/../src/ + ${CMAKE_CURRENT_SOURCE_DIR}/../src/ ${CMAKE_BINARY_DIR}/include ) From 14b0cea783eb820f78db200de2a9e4e3a04f5303 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 23 Jul 2018 08:33:50 -0700 Subject: [PATCH 28/64] [OMPD] Save cuda kernel info in ompd handles --- libompd/src/omp-debug.cpp | 16 ++++++++++++++-- libompd/src/omp-debug.h | 19 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 325df0bfa..676684901 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -260,6 +260,7 @@ ompd_rc_t ompd_get_current_parallel_handle( (*parallel_handle)->ah = thread_handle->ah; (*parallel_handle)->th = taddr; + (*parallel_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; } else { ompd_address_t taddr, lwt; @@ -723,13 +724,23 @@ ompd_get_thread_handle(ompd_address_space_handle_t if (tId != p->threadIdx.x) return ompd_rc_stale_handle; - ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), - (void 
**)(thread_handle)); + // allocate both the thread handle and the cuda kernel info in one go + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t) + + sizeof(ompd_cuda_thread_kernel_info_t), + (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; (*thread_handle)->ah = addr_handle; (*thread_handle)->th = taddr; + (*thread_handle)->cuda_kernel_info = + (ompd_cuda_thread_kernel_info_t*)((*thread_handle) + 1); + + (*thread_handle)->cuda_kernel_info->cudaDevId = p->cudaDevId; + (*thread_handle)->cuda_kernel_info->cudaContext = p->cudaContext; + (*thread_handle)->cuda_kernel_info->warpSize = p->warpSize; + (*thread_handle)->cuda_kernel_info->gridId = p->gridId; + (*thread_handle)->cuda_kernel_info->kernelId = p->kernelId; } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") @@ -755,6 +766,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t return ret; (*thread_handle)->ah = addr_handle; (*thread_handle)->th = taddr; + (*thread_handle)->cuda_kernel_info = NULL; #ifndef NDEBUG if (ret != ompd_rc_ok) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index 18e693a00..a642cb315 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -71,6 +71,16 @@ class ompdAllocatable { } }; +// Information shared by all threads in a kernel +// Used to map thread handles to native cuda thread ids +typedef struct _ompd_cuda_thread_kernel_info_s : public ompdAllocatable { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_addr_t kernelId; +} ompd_cuda_thread_kernel_info_t; + typedef struct _ompd_address_space_context_s ompd_address_space_context_t; typedef struct _ompd_process_handle_s : public ompdAllocatable { @@ -92,18 +102,27 @@ typedef struct _ompd_thread_handle_s : public ompdAllocatable { ompd_address_space_handle_t *ah; ompd_thread_context_t *thread_context; ompd_address_t th; /* target handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* only valid for cuda */ } ompd_thread_handle_t; typedef struct _ompd_parallel_handle_s : public ompdAllocatable { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* copied from the thread + used to retrieve this + parallel region handle + */ } ompd_parallel_handle_t; typedef struct _ompd_task_handle_s : public ompdAllocatable { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* copied from the thread + used to retrieve this + parallel region handle + */ } ompd_task_handle_t; #endif From 7a7928108b3ced1ff915b55071ecf1567f854fdd Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 30 Jul 2018 09:31:13 -0700 Subject: [PATCH 29/64] [OMPD] Set correct omp states for all modes (cuda) --- .../deviceRTLs/nvptx/src/ompd-specific.cu | 30 +++++++++++++++---- .../deviceRTLs/nvptx/src/ompd-specific.h | 19 ++++++++---- .../deviceRTLs/nvptx/src/omptarget-nvptx.cu | 9 ++++-- .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 11 ++++--- libomptarget/deviceRTLs/nvptx/src/parallel.cu | 13 ++++---- 5 files changed, 60 insertions(+), 22 deletions(-) diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index 22a15ae45..aed0b4e13 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -28,10 +28,6 @@ __device__ 
__shared__ __device__ void ompd_init ( void ) { - getMyTopTaskDescriptor()->ompd_thread_info.state = omp_state_undefined; - getMyTopTaskDescriptor()->ompd_thread_info.blockIdx_x = blockIdx.x; - getMyTopTaskDescriptor()->ompd_thread_info.threadIdx_x = threadIdx.x; - if (ompd_target_initialized) return; @@ -56,8 +52,30 @@ __device__ void ompd_init ( void ) ompd_target_initialized = 1; } -__device__ void ompd_set_device_thread_state(omp_state_t state) { - getMyTopTaskDescriptor()->ompd_thread_info.state = state; +INLINE void ompd_init_thread(omptarget_nvptx_TaskDescr *currTaskDescr) { + currTaskDescr->ompd_thread_info.blockIdx_x = blockIdx.x; + currTaskDescr->ompd_thread_info.threadIdx_x = threadIdx.x; +} + +__device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state) { + taskDescr->ompd_thread_info.state = state; +} + +__device__ void ompd_set_device_thread_state(omp_state_t state) { + ompd_set_device_specific_thread_state(getMyTopTaskDescriptor(), state); +} + +__device__ void ompd_init_thread_parallel() { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(currTaskDescr); + ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_parallel); +} + +__device__ void ompd_init_thread_master() { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(currTaskDescr); + ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_serial); } __device__ void ompd_bp_parallel_begin (){ asm (""); } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 4fb51f08a..226934284 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -21,7 +21,10 @@ extern "C" __device__ void ompd_bp_task_end ( void ); #define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext, topTaskDescr) \ OMPD_ACCESS(omptarget_nvptx_TaskDescr,ompd_thread_info) \ - OMPD_ACCESS(ompd_nvptx_thread_info_t,state) + OMPD_ACCESS(ompd_nvptx_thread_info_t,state) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,threadIdx_x) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,teamContext) \ + OMPD_ACCESS(omptarget_nvptx_TeamDescr,levelZeroTaskDescr) #define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ OMPD_SIZEOF(omptarget_nvptx_ThreadPrivateContext)\ @@ -37,19 +40,25 @@ typedef enum { omp_state_work_reduction = 0x002 } omp_state_t; -__device__ void ompd_set_device_thread_state(omp_state_t); +class omptarget_nvptx_TaskDescr; + +__device__ void ompd_init_thread_master(); +__device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); +__device__ void ompd_set_device_thread_state(omp_state_t state); +__device__ void ompd_init_thread_parallel(); INLINE void ompd_reset_device_thread_state() { - ompd_set_device_thread_state(omp_state_undefined); + ompd_set_device_thread_state(omp_state_work_serial); } typedef struct { uint64_t state; // In the host runtime we use the OMPT state. // Here we need to have our own place to store it. - int blockIdx_x; // Libomptarget should only schedule task in one dimension. + uint16_t blockIdx_x; // Libomptarget should only schedule task in one dimension. 
// To store a unique identifier for the current thread, we // simply store ThreadIdx.x and BlockIdx.x - int threadIdx_x; + uint16_t threadIdx_x; } ompd_nvptx_thread_info_t; #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 294647fdd..8a1f3e05d 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -94,6 +94,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { currTaskDescr->ThreadLimit() = ThreadLimit; #ifdef OMPD_SUPPORT ompd_init(); + ompd_init_thread_master(); #endif /*OMPD_SUPPORT*/ } @@ -141,6 +142,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, currTeamDescr.InitTeamDescr(); // init counters (copy start to init) workDescr.CounterGroup().Reset(); +#ifdef OMPD_SUPPORT + ompd_init(); +#endif /*OMPD_SUPPORT*/ } __syncthreads(); @@ -177,8 +181,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; } #ifdef OMPD_SUPPORT - ompd_init(); -#endif /*OMPD_SUPPORT*/ + ompd_init_thread_parallel(); // __kmpc_kernel_parallel() is not called in + // spmd mode +#endif } EXTERN void __kmpc_spmd_kernel_deinit() { diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 530ed512a..899809fb0 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -67,9 +67,6 @@ // arguments needed for L0 parallelism only. class omptarget_nvptx_SharedArgs { -#if OMPD_SUPPORT - friend void __device__ ompd_init( void ); -#endif /* OMPD_SUPPORT */ public: // All these methods must be called by the master thread only. 
INLINE void Init() { @@ -156,7 +153,10 @@ extern __device__ __shared__ DataSharingStateTy DataSharingState; class omptarget_nvptx_TaskDescr { #if OMPD_SUPPORT friend void __device__ ompd_init( void ); - friend void __device__ ompd_set_device_thread_state(omp_state_t state); + friend INLINE void ompd_init_thread( + omptarget_nvptx_TaskDescr *currTaskDescr); + friend __device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); #endif /* OMPD_SUPPORT */ public: // methods for flags @@ -258,6 +258,9 @@ class omptarget_nvptx_WorkDescr { //////////////////////////////////////////////////////////////////////////////// class omptarget_nvptx_TeamDescr { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /*OMPD_SUPPORT*/ public: // access to data INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index b992e8929..85173b5b2 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -304,6 +304,12 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "only team master can create parallel"); +#ifdef OMPD_SUPPORT + // Move the previous thread into undefined state (will be reset in __kmpc_kernel_end_parallel) + // TODO (mr) find a better place to do this + ompd_set_device_thread_state(omp_state_undefined); +#endif /*OMPD_SUPPORT*/ + // set number of threads on work descriptor // this is different from the number of cuda threads required for the parallel // region @@ -359,7 +365,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, isActive = true; #ifdef OMPD_SUPPORT - ompd_set_device_thread_state(omp_state_work_parallel); + ompd_init_thread_parallel(); #endif /*OMPD_SUPPORT*/ } @@ -413,7 +419,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); #ifdef OMPD_SUPPORT - ompd_set_device_thread_state(omp_state_work_serial); + ompd_init_thread_parallel(); // we are still in a prallel region #endif /*OMPD_SUPPORT*/ } @@ -429,9 +435,6 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, threadId, currTaskDescr->GetPrevTaskDescr()); // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); -#ifdef OMPD_SUPPORT - ompd_set_device_thread_state(omp_state_work_parallel); -#endif /*OMPD_SUPPORT*/ } EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) { From 5bec325bf7f326e5316a5cc91c0099916588d3ef Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 30 Jul 2018 09:33:18 -0700 Subject: [PATCH 30/64] [OMPD] Fix thread handles in all exec modes (cuda) --- libompd/src/TargetValue.cpp | 10 +++++++++ libompd/src/TargetValue.h | 4 ++++ libompd/src/omp-debug.cpp | 43 +++++++++++++++++++++++++++++++------ 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index ecf348823..8f2288257 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -383,6 +383,16 @@ TValue TValue::getArrayElement(int elemNumber) const { return ret; } +TValue TValue::getPtrArrayElement(int elemNumber) const { + if (gotError()) { + return *this; + } + assert(pointerLevel > 0 && "This only works on arrays of pointers"); + TValue ret = *this; + ret.symbolAddr.address += elemNumber * 
type_sizes.sizeof_pointer; + return ret; +} + TBaseValue::TBaseValue(const TValue &_tvalue, ompd_target_prim_types_t _baseType) : TValue(_tvalue), baseTypeSize(ompd_sizeof(_baseType)) {} diff --git a/libompd/src/TargetValue.h b/libompd/src/TargetValue.h index 40b61a54e..cf14ea716 100644 --- a/libompd/src/TargetValue.h +++ b/libompd/src/TargetValue.h @@ -186,6 +186,10 @@ class TValue { * Get an array element */ TValue getArrayElement(int elemNumber) const; + /** + * Get an element of a pointer arraz + */ + TValue getPtrArrayElement(int elemNumber) const; /** * Did we raise some error yet? */ diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 676684901..73c912cb9 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -697,7 +697,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t if (kind == OMPD_THREAD_ID_CUDALOGICAL) { ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->items.threadId + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x] TValue th = TValue(context, tcontext, "omptarget_nvptx_threadPrivateContext", OMPD_SEGMENT_CUDA_PTX_SHARED) @@ -706,23 +706,52 @@ ompd_get_thread_handle(ompd_address_space_handle_t .access("topTaskDescr") .cast("omptarget_nvptx_TaskDescr", 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .getArrayElement(p->threadIdx.x); + .getPtrArrayElement(p->threadIdx.x) + .dereference(); ompd_address_t taddr; ret = th.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) { + if (taddr.address == 0 && p->threadIdx.x % 32 == 0) { + // check for the master task/thread instead + // The master thread should never have the threadIdx.x of zero, so + // checking it this way should be safe + + th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("teamContext") + .cast("omptarget_nvptx_TeamDescr", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("levelZeroTaskDescr"); + + ret = th.getAddress(&taddr); + + if (ret != ompd_rc_ok) + return ret; + } else { + return ret; + } + } - ret = th.access("items__threadId") + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x] + // ->ompd_thread_info.threadIdx_x + ret = th.cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("threadIdx_x") .castBase(ompd_type_short) .getValue(tId); if (ret != ompd_rc_ok) return ret; - if (tId != p->threadIdx.x) - return ompd_rc_stale_handle; + if (tId != p->threadIdx.x) { + return ompd_rc_stale_handle; + } // allocate both the thread handle and the cuda kernel info in one go ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t) + From b3de3ae57f77d7422fc225764a3d040ef919b4c5 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 1 Aug 2018 15:09:18 -0700 Subject: [PATCH 31/64] [OMPD] Add ompd break points for thread begin/end --- runtime/src/kmp_runtime.cpp | 19 +++++++++++++++++++ runtime/src/ompd-specific.cpp | 10 ++++++++++ runtime/src/ompd-specific.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp index 197a78a78..896d431a1 100644 --- a/runtime/src/kmp_runtime.cpp +++ b/runtime/src/kmp_runtime.cpp @@ -3843,6 +3843,10 @@ int __kmp_register_root(int initial_thread) { 
ompt_set_thread_state(root_thread, omp_state_work_serial); } #endif +#if OMPD_SUPPORT + if ( ompd_state & OMPD_ENABLE_BP ) + ompd_bp_thread_begin (); +#endif KMP_MB(); __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); @@ -3926,6 +3930,11 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) { __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); #endif /* KMP_OS_WINDOWS */ +#if OMPD_SUPPORT + if ( ompd_state & OMPD_ENABLE_BP ) + ompd_bp_thread_end (); +#endif + #if OMPT_SUPPORT if (ompt_enabled.ompt_callback_thread_end) { ompt_callbacks.ompt_callback(ompt_callback_thread_end)( @@ -5605,6 +5614,11 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) { } #endif +#if OMPD_SUPPORT + if ( ompd_state & OMPD_ENABLE_BP ) + ompd_bp_thread_begin (); +#endif + #if OMPT_SUPPORT if (ompt_enabled.enabled) { this_thr->th.ompt_thread_info.state = omp_state_idle; @@ -5673,6 +5687,11 @@ void *__kmp_launch_thread(kmp_info_t *this_thr) { } TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); +#if OMPD_SUPPORT + if ( ompd_state & OMPD_ENABLE_BP ) + ompd_bp_thread_end (); +#endif + #if OMPT_SUPPORT if (ompt_enabled.ompt_callback_thread_end) { ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); diff --git a/runtime/src/ompd-specific.cpp b/runtime/src/ompd-specific.cpp index 83275e1dd..2c5f4b796 100644 --- a/runtime/src/ompd-specific.cpp +++ b/runtime/src/ompd-specific.cpp @@ -116,6 +116,16 @@ void ompd_bp_task_end ( void ){ we might want to use a separate object file? */ asm (""); } +void ompd_bp_thread_begin ( void ){ + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm (""); +} +void ompd_bp_thread_end ( void ){ + /* naive way of implementing hard to opt-out empty function + we might want to use a separate object file? */ + asm (""); +} #endif /* OMPD_SUPPORT */ diff --git a/runtime/src/ompd-specific.h b/runtime/src/ompd-specific.h index ec3bdc1f0..d3b554008 100644 --- a/runtime/src/ompd-specific.h +++ b/runtime/src/ompd-specific.h @@ -18,6 +18,8 @@ void ompd_bp_parallel_begin ( void ); void ompd_bp_parallel_end ( void ); void ompd_bp_task_begin ( void ); void ompd_bp_task_end ( void ); +void ompd_bp_thread_begin ( void ); +void ompd_bp_thread_end ( void ); #ifdef __cplusplus } /* extern "C" */ #endif From b96b31c4f3db4b78b490ec3ed8c1e5f47bcf50eb Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 7 Aug 2018 11:54:13 -0700 Subject: [PATCH 32/64] [OMPD] Add some support: parallel handles (cuda) Adds support for ompd_get_current_parallel_handle and ompd_get_enclosing_parallel_handle for cuda devices. 
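For illustration, a debugger-side sketch of the intended call sequence
(assumes the thread handle was obtained with ompd_thread_id_cudalogical and
that an odb-style "functions" table of OMPD entry points is loaded; error
handling omitted):

    // Walk the parallel-region nesting of one cuda thread handle,
    // innermost region first.
    ompd_parallel_handle_t *ph, *enclosing;
    if (functions->ompd_get_current_parallel_handle(thread_handle, &ph) == ompd_rc_ok) {
      // Stop as soon as no enclosing region is reported.
      while (functions->ompd_get_enclosing_parallel_handle(ph, &enclosing) == ompd_rc_ok) {
        functions->ompd_release_parallel_handle(ph);
        ph = enclosing;
      }
      functions->ompd_release_parallel_handle(ph);
    }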
--- libompd/gdb-wrapper/OMPDCommand.cpp | 79 +++++++++ libompd/src/TargetValue.cpp | 28 +++ libompd/src/omp-debug.cpp | 163 ++++++++++++++---- libompd/src/omp-debug.h | 14 +- libompd/src/omp-icv.cpp | 3 + libompd/src/omp-state.cpp | 4 + .../deviceRTLs/nvptx/src/ompd-specific.cu | 26 ++- .../deviceRTLs/nvptx/src/ompd-specific.h | 28 ++- .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 9 + libomptarget/deviceRTLs/nvptx/src/parallel.cu | 19 ++ 10 files changed, 323 insertions(+), 50 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 61b7dbe45..5c61da70e 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -527,6 +527,54 @@ vector odbGetThreadHandles(ompd_address_space_handle_t* a return thread_handles; } +vector odbGetCudaThreadHandles(ompd_address_space_handle_t* addrhandle, OMPDFunctionsPtr functions) +{ + ompd_rc_t ret; + + CudaGdb cuda; + vector cuda_ContextPools; + map device_initialized; + map address_spaces; + vector device_thread_handles; + + for(auto i: cuda.threads) { + if (!device_initialized[i.coord.cudaContext]) { + OMPDCudaContextPool* cpool; + cpool = new OMPDCudaContextPool(&i); + ompd_rc_t result; + + device_initialized[i.coord.cudaContext] = true; + result = functions->ompd_device_initialize( + addrhandle, + cpool->getGlobalOmpdContext(), + ompd_device_kind_cuda, + sizeof(i.coord.cudaContext), + &i.coord.cudaContext, + &cpool->ompd_device_handle); + + if (result != ompd_rc_ok) + { + continue; + } + + address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; + } + ompd_thread_handle_t* thread_handle; + ompd_rc_t ret = functions->ompd_get_thread_handle( + address_spaces[i.coord.cudaContext], + ompd_thread_id_cudalogical, + sizeof(i.coord), &i.coord, + &thread_handle); + + if (ret == ompd_rc_ok) + { + device_thread_handles.push_back(thread_handle); + } + } + + return device_thread_handles; +} + vector odbGetParallelRegions(OMPDFunctionsPtr functions, ompd_thread_handle_t* &th) { ompd_rc_t ret; @@ -731,6 +779,10 @@ const char* OMPDTest::toString() const void OMPDParallelRegions::execute() const { ompd_rc_t ret; + + // + // For the host runtime + // auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); OMPDParallelHandleCmp parallel_cmp_op(functions); @@ -761,6 +813,33 @@ void OMPDParallelRegions::execute() const for (auto &p: host_parallel_handles) { functions->ompd_release_parallel_handle(p.first); } + + // + // For Cuda devices + // + auto cuda_thread_handles = odbGetCudaThreadHandles(addrhandle, functions); + std::map, + OMPDParallelHandleCmp> cuda_parallel_handles(parallel_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto p: odbGetParallelRegions(functions, t)) { + cuda_parallel_handles[p].push_back(t); + } + } + + printf("DEVICE PARALLEL REGIONS\n"); + printf("Parallel Handle Num Threads \n"); + printf("------------------------------ \n"); + for (auto &p: cuda_parallel_handles) { + printf("%-15p %-10zu\n", p.first, p.second.size()); + } + + for (auto t: cuda_thread_handles) { + functions->ompd_release_thread_handle(t); + } + for (auto &p: cuda_parallel_handles) { + functions->ompd_release_parallel_handle(p.first); + } } const char *OMPDParallelRegions::toString() const diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index 8f2288257..1350d632c 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -73,8 +73,18 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { << ") \\" << std::endl; return ret; } + 
symbolAddr.segment = descSegment; + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are always in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, 1 * TValue::type_sizes.sizeof_long_long, &(tmpSize)); @@ -145,6 +155,15 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { } symbolAddr.segment = descSegment; + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are always in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); @@ -182,6 +201,15 @@ ompd_rc_t TType::getElementSize(const char *fieldName, ompd_size_t *size) { } symbolAddr.segment = descSegment; + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are always in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 73c912cb9..bdeed24a7 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -23,7 +23,6 @@ #include #include -const ompd_callbacks_t *callbacks = nullptr; ompd_device_type_sizes_t type_sizes; /* --- OMPD functions ------------------------------------------------------- */ @@ -34,6 +33,8 @@ ompd_rc_t ompd_initialize(ompd_word_t version, const ompd_callbacks_t *table) { ompd_rc_t ret = table ?
ompd_rc_ok : ompd_rc_bad_input; callbacks = table; TValue::callbacks = table; + __ompd_init_icvs(table); + __ompd_init_states(table); return ret; } @@ -246,15 +247,41 @@ ompd_rc_t ompd_get_current_parallel_handle( if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { ompd_address_t taddr; - TValue ph = TValue(context, thread_context, - "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED); + TValue prevTask = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference(); + + ret = prevTask.getAddress(&taddr); + + if (ret != ompd_rc_ok) { + if (taddr.address == 0) { + prevTask = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("teamContext") + .cast("omptarget_nvptx_TeamDescr", 0) + .access("levelZeroTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } else { + return ret; + } + } + TValue ph = prevTask.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0) + .access("enclosed_parallel"); + ret = ph.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), - (void **)(parallel_handle)); + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); if (ret != ompd_rc_ok) return ret; @@ -309,38 +336,92 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); + ompd_address_t taddr = parallel_handle->th, lwt; + ompd_rc_t ret; - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, parallel_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + uint16_t level; + TValue curParallelInfo = TValue(context, taddr) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED); - TValue teamdata = - TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_parent") /*t.t_parent*/ - .cast("kmp_team_p", 1) - .access("t"); /*t.t_parent->t*/ + ret = curParallelInfo + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("level") + .castBase(ompd_type_short) + .getValue(level); - ret = teamdata.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; + TValue prevTaskDescr = curParallelInfo.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1) + .dereference(); + + ret = prevTaskDescr.getAddress(&taddr); + + if (ret != ompd_rc_ok) { + if (taddr.address == 0 && level == 1) { + // If we are in generic mode, there is an implicit parallel region + // around the master thread + prevTaskDescr = TValue(context, NULL, 
"omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo"); + } else { + return ret; + } + } else { + prevTaskDescr = prevTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0) + .access("enclosed_parallel"); + } + + + prevTaskDescr.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getAddress(&taddr); + + } else { + ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, parallel_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + + TValue teamdata = + TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_parent") /*t.t_parent*/ + .cast("kmp_team_p", 1) + .access("t"); /*t.t_parent->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + } } ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), @@ -350,6 +431,8 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( (*enclosing_parallel_handle)->th = taddr; (*enclosing_parallel_handle)->lwt = lwt; (*enclosing_parallel_handle)->ah = parallel_handle->ah; + (*enclosing_parallel_handle)->cuda_kernel_info = + parallel_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -411,11 +494,17 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, return ompd_rc_stale_handle; if (!parallel_handle_2) return ompd_rc_stale_handle; - if (parallel_handle_1->th.address - parallel_handle_2->th.address) + if (parallel_handle_1->ah->kind != parallel_handle_2->ah->kind) + return ompd_rc_bad_input; + if (parallel_handle_1->ah->kind == OMP_DEVICE_KIND_HOST) { + if (parallel_handle_1->th.address - parallel_handle_2->th.address) + *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; + else + *cmp_value = + parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + } else { *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; - else - *cmp_value = - parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + } return ompd_rc_ok; } @@ -1035,14 +1124,14 @@ ompd_rc_t ompd_get_task_function( task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; TValue taskInfo; if(task_handle->lwt.address!=0) - return ompd_rc_bad_input; // We need to decide what we do here. + return ompd_rc_bad_input; // We need to decide what we do here. else ret = TValue(context, task_handle->th). cast("kmp_taskdata_t",0). /*t*/ getArrayElement(1). /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ cast("kmp_task_t",0). /* (kmp_task_t *) */ access("routine"). /*td->ompt_task_info*/ - castBase(). + castBase(). 
getValue(task_addr->address); return ret; } diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index a642cb315..6ca5840b5 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -35,13 +35,15 @@ extern "C" { /****************************************************************************** * General helper functions - */ -ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); + */ + ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); #ifdef __cplusplus -} + } + + +static const ompd_callbacks_t *callbacks = nullptr; -extern const ompd_callbacks_t *callbacks; class ompdAllocatable { public: @@ -127,4 +129,8 @@ typedef struct _ompd_task_handle_s : public ompdAllocatable { #endif +// TODO (mr) this is ugly, but better then a global symbol (?) +void __ompd_init_icvs(const ompd_callbacks_t *table); +void __ompd_init_states(const ompd_callbacks_t *table); + #endif /* SRC_OMP_DEBUG_H_ */ diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp index 13d5de20f..0b44c6796 100644 --- a/libompd/src/omp-icv.cpp +++ b/libompd/src/omp-icv.cpp @@ -14,6 +14,9 @@ macro (implicit_var, "ompd-implicit-var", ompd_scope_task) \ macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel) \ +void __ompd_init_icvs(const ompd_callbacks_t *table) { + callbacks = table; +} enum ompd_icv { ompd_icv_undefined_marker = 0, // ompd_icv_undefined is already defined in ompd.h diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp index 82b31fa48..df117a05b 100644 --- a/libompd/src/omp-state.cpp +++ b/libompd/src/omp-state.cpp @@ -3,6 +3,10 @@ #include "omp-debug.h" #include +void __ompd_init_states(const ompd_callbacks_t *table) { + callbacks = table; +} + static const char *get_ompd_state_name(ompd_word_t state) { switch (state) { #define ompd_state_macro(state, code) \ diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index aed0b4e13..cd53817bf 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -7,15 +7,15 @@ __device__ __shared__ static int ompd_target_initialized; -#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m; +#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m; OMPD_FOREACH_ACCESS(ompd_target_declare_access) #undef ompd_target_declare_access -#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m; +#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m; OMPD_FOREACH_ACCESS(ompd_target_declare_sizeof_member) #undef ompd_target_declare_sizeof_member -#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t; +#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t; OMPD_FOREACH_SIZEOF(ompd_target_declare_sizeof) #undef ompd_target_declare_sizeof @@ -31,24 +31,36 @@ __device__ void ompd_init ( void ) if (ompd_target_initialized) return; -#define ompd_target_init_access(t,m) ompd_access__##t##__##m = (uint64_t)&(((t*)0)->m); +#define ompd_target_init_access(t,m) ompd_access__##t##__##m = (uint64_t)&(((t*)0)->m); OMPD_FOREACH_ACCESS(ompd_target_init_access) #undef ompd_target_init_access - ompd_access__omptarget_nvptx_TaskDescr__items__threadId = + ompd_access__omptarget_nvptx_TaskDescr__items__threadId = (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadId); -#define 
ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m = sizeof(((t*)0)->m); +#define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m = sizeof(((t*)0)->m); OMPD_FOREACH_ACCESS(ompd_target_init_sizeof_member) #undef ompd_target_init_sizeof_member ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadId = (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadId); -#define ompd_target_init_sizeof(t) ompd_sizeof__##t = sizeof(t); +#define ompd_target_init_sizeof(t) ompd_sizeof__##t = sizeof(t); OMPD_FOREACH_SIZEOF(ompd_target_init_sizeof) #undef ompd_target_init_sizeof + omptarget_nvptx_threadPrivateContext->ompd_levelZeroParallelInfo.level = 0; + if (isSPMDMode()) { + omptarget_nvptx_threadPrivateContext->teamContext.levelZeroTaskDescr + .ompd_thread_info.enclosed_parallel.parallel_tasks = + &omptarget_nvptx_threadPrivateContext->levelOneTaskDescr[0]; + } else { + // generic mode + omptarget_nvptx_threadPrivateContext->ompd_levelZeroParallelInfo + .parallel_tasks = &omptarget_nvptx_threadPrivateContext->teamContext + .levelZeroTaskDescr; + } + ompd_target_initialized = 1; } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 226934284..64a2bf5f3 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -20,11 +20,17 @@ extern "C" __device__ void ompd_bp_task_end ( void ); #define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext, topTaskDescr) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,teamContext) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,ompd_levelZeroParallelInfo) \ OMPD_ACCESS(omptarget_nvptx_TaskDescr,ompd_thread_info) \ + OMPD_ACCESS(omptarget_nvptx_TaskDescr,prev) \ + OMPD_ACCESS(omptarget_nvptx_TeamDescr,levelZeroTaskDescr) \ OMPD_ACCESS(ompd_nvptx_thread_info_t,state) \ OMPD_ACCESS(ompd_nvptx_thread_info_t,threadIdx_x) \ - OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,teamContext) \ - OMPD_ACCESS(omptarget_nvptx_TeamDescr,levelZeroTaskDescr) + OMPD_ACCESS(ompd_nvptx_thread_info_t,enclosed_parallel) \ + OMPD_ACCESS(ompd_nvptx_parallel_info_t,level) \ + OMPD_ACCESS(ompd_nvptx_parallel_info_t,parallel_tasks) + #define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ OMPD_SIZEOF(omptarget_nvptx_ThreadPrivateContext)\ @@ -52,6 +58,23 @@ INLINE void ompd_reset_device_thread_state() { ompd_set_device_thread_state(omp_state_work_serial); } +/* We store parallel info in the threadPrivateContext the same way that task + * descriptors are stored. Currently there is no support for nested + * parallelism (TODO: there will probably be in the future), so we store one + * parallel descriptor in the threadPrivateContext for the outermost parallel + * region and additonally one descriptor in each thread in case of serialized + * inner parallel regions + */ +typedef struct { + uint16_t level; + /* If level = 0, parallel_tasks points just to the master task descriptor + * if level = 1, parallel_tasks points to threadPrivateContext->levelOneTaskDescr + * if level > 1, we are in a serialized parallel region and parallel_tasks points + * to the single task in the parallel region. + */ + omptarget_nvptx_TaskDescr *parallel_tasks; +} ompd_nvptx_parallel_info_t; + typedef struct { uint64_t state; // In the host runtime we use the OMPT state. // Here we need to have our own place to store it. 
@@ -59,6 +82,7 @@ typedef struct { // To store a unique identifier for the current thread, we // simply store ThreadIdx.x and BlockIdx.x uint16_t threadIdx_x; + ompd_nvptx_parallel_info_t enclosed_parallel; } ompd_nvptx_thread_info_t; #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 899809fb0..0cd65a502 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -200,6 +200,11 @@ class omptarget_nvptx_TaskDescr { INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum); +#ifdef OMPD_SUPPORT + INLINE ompd_nvptx_thread_info_t *ompd_ThreadInfo() { + return &ompd_thread_info; + } +#endif private: // bits for flags: (7 used, 1 free) @@ -394,6 +399,10 @@ class omptarget_nvptx_ThreadPrivateContext { Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; // Queue to which this object must be returned. uint64_t SourceQueue; +#ifdef OMPD_SUPPORT + // The implicit parallel region around the master task in generic mode + ompd_nvptx_parallel_info_t ompd_levelZeroParallelInfo; +#endif }; /// Device envrionment data diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 85173b5b2..f4e115614 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -305,6 +305,13 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, "only team master can create parallel"); #ifdef OMPD_SUPPORT + // Set ompd info for first level parallel region (this info is stored in the + // master threads task info, so it can easily be accessed + ompd_nvptx_parallel_info_t &nextPar = currTaskDescr->ompd_ThreadInfo() + ->enclosed_parallel; + nextPar.level = 1; + nextPar.parallel_tasks = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(0); // Move the previous thread into undefined state (will be reset in __kmpc_kernel_end_parallel) // TODO (mr) find a better place to do this ompd_set_device_thread_state(omp_state_undefined); @@ -415,6 +422,18 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { newTaskDescr->ThreadId() = 0; newTaskDescr->ThreadsInTeam() = 1; +#ifdef OMPD_SUPPORT + // Set ompd parallel info for the next parallel region in the previous task + // descriptor + ompd_nvptx_parallel_info_t &newPar = + currTaskDescr->ompd_ThreadInfo()->enclosed_parallel; + newPar.level = currTaskDescr->GetPrevTaskDescr() + ->ompd_ThreadInfo() + ->enclosed_parallel + .level + 1; + newPar.parallel_tasks = newTaskDescr; +#endif + // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); From 3609e86e762d43fd52d6094847b16da0e25fc3b1 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 8 Aug 2018 08:21:02 -0700 Subject: [PATCH 33/64] [OMPD] remove usage of kernel id in odb --- libompd/gdb-wrapper/OMPDContext.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/libompd/gdb-wrapper/OMPDContext.cpp b/libompd/gdb-wrapper/OMPDContext.cpp index 9b92e0d13..b344a1de4 100644 --- a/libompd/gdb-wrapper/OMPDContext.cpp +++ b/libompd/gdb-wrapper/OMPDContext.cpp @@ -139,7 +139,24 @@ ompd_thread_context_t * OMPDHostContext::getContextForThread(gdb_thread_id& thr_ bool OMPDCudaContext::setThisGdbContext() { - bool ret = false; + bool 
ret = true; + stringstream device_command; + stringstream coord_command; + device_command << "cuda device " << this->cudathread->coord.cudaDevId; + coord_command << "cuda grid " << this->cudathread->coord.gridId + << " block " << this->cudathread->coord.blockIdx.x + << " thread " << this->cudathread->coord.threadIdx.x; + OMPDContextPool::gdb->writeInput(device_command.str().c_str()); + string gdbOut = OMPDContextPool::gdb->readOutput(); + if (gdbOut.find("cannot be satisfied") != 0) + ret = false; + + OMPDContextPool::gdb->writeInput(coord_command.str().c_str()); + gdbOut = OMPDContextPool::gdb->readOutput(); + if (gdbOut.find("cannot be satisfied") != 0) + ret = false; + +#if 0 stringstream command; command #ifdef HACK_FOR_CUDA_GDB @@ -154,6 +171,7 @@ bool OMPDCudaContext::setThisGdbContext() string gdbOut = OMPDContextPool::gdb->readOutput(); if (gdbOut.find("not known")==0) ret = true; +#endif return ret; } From a619134aee808a93996e7ad4eaf1e774d717c60c Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 8 Aug 2018 08:25:55 -0700 Subject: [PATCH 34/64] [OMPD] Add ompd_get_thread_in_parallel for cuda --- libompd/src/omp-debug.cpp | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index bdeed24a7..60363e4e6 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -182,16 +182,29 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_address_t taddr; - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_threads") /*t.t_threads*/ - .cast("kmp_info_t", 2) - .getArrayElement(nth_handle) /*t.t_threads[nth_handle]*/ - .access("th") /*t.t_threads[i]->th*/ - .getAddress(&taddr); + if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + ret = TValue(context, parallel_handle->th) + .cast("ompd_npvtx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_npvtx_TaskDescr", 1) + .getPtrArrayElement(nth_handle) + .dereference() + .getAddress(&taddr); + + } else { + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_threads") /*t.t_threads*/ + .cast("kmp_info_t", 2) + .getArrayElement(nth_handle) /*t.t_threads[nth_handle]*/ + .access("th") /*t.t_threads[i]->th*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) From 011a37668a25fc89a9a35d3c2c4a663828cd7614 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 8 Aug 2018 10:08:12 -0700 Subject: [PATCH 35/64] [OMPD] Move memory segment values to ompd_types.h --- libompd/src/ompd-private.h | 24 ------------------------ libompd/src/ompd.h | 2 ++ libompd/src/ompd_types.h | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h index 6283aa33a..7210980aa 100644 --- a/libompd/src/ompd-private.h +++ b/libompd/src/ompd-private.h @@ -55,30 +55,6 @@ typedef enum omp_state_t { #define OMPD_LAST_OMP_STATE omp_state_overhead -#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) -#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) -/** - * The following definitions match with ptx information stored in DWARF - */ -#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) -#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) -#define 
OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) -#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) -#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) -#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) -#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) -#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) -#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) -#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) -#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) -#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) -#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) -#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) -#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) -#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) - /** * Primitive types. */ diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index d6ea310c8..1d78151e8 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -56,6 +56,8 @@ typedef struct ompd_address_t { ompd_addr_t address; /* target address in the segment */ } ompd_address_t; +const uint64_t ompd_segment_none = 0; + /* types for device and thread id KIND, not for the actual thread/device id */ typedef uint64_t omp_device_t; typedef uint64_t ompd_thread_id_t; diff --git a/libompd/src/ompd_types.h b/libompd/src/ompd_types.h index ff66e79c5..acc82f750 100644 --- a/libompd/src/ompd_types.h +++ b/libompd/src/ompd_types.h @@ -16,7 +16,31 @@ extern "C" { #define OMPD_THREAD_ID_CUDALOGICAL 3 #define OMPD_THREAD_ID_MAX 4 +#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) +#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) /** + * The following definitions match with ptx information stored in DWARF + */ +#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) +#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) +#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) +#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) +#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) +#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) +#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) +#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) +#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) +#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) +#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) +#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) +#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) +#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) +#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) +#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) + + /** * Logical coordinates of OMP target device threads */ typedef struct ompd_dim3_t { From 8d498871e88a324865318a54a505ba84796961aa Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 8 Aug 2018 13:57:21 -0700 Subject: [PATCH 36/64] [OMPD] Re-enable checking for ompd state tracking Also: remove get_*_data functions. Those are no longer part of the spec. 
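With the state-tracking check re-enabled, inquiry calls such as ompd_get_state, ompd_get_task_frame and ompd_get_task_function report ompd_rc_needs_state_tracking when the target runtime exports ompd_state == 0. Below is a minimal caller-side sketch of that pattern, not part of this patch: check_thread_state is an illustrative name, and a previously obtained thread handle is assumed.

    /* Hypothetical helper, not part of this patch: distinguishes "state
     * tracking disabled in the target runtime" from a real answer. */
    static ompd_rc_t check_thread_state(ompd_thread_handle_t *thread_handle) {
      ompd_word_t state;
      ompd_wait_id_t wait_id;
      ompd_rc_t rc = ompd_get_state(thread_handle, &state, &wait_id);
      if (rc == ompd_rc_needs_state_tracking) {
        /* The target runtime was initialized with ompd_state == 0;
         * thread states (and task frames/functions) are unavailable. */
        return rc;
      }
      if (rc == ompd_rc_ok) {
        /* state holds one of the omp_state_* values, wait_id identifies
         * the object (if any) the thread is waiting on. */
      }
      return rc;
    }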
--- libompd/src/omp-debug.cpp | 82 ++++----------------------------------- 1 file changed, 7 insertions(+), 75 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 60363e4e6..c9488af42 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -24,6 +24,7 @@ #include ompd_device_type_sizes_t type_sizes; +uint64_t ompd_state; /* --- OMPD functions ------------------------------------------------------- */ @@ -62,6 +63,12 @@ ompd_process_initialize(ompd_address_space_context_t (*addrhandle)->context = context; (*addrhandle)->kind = OMP_DEVICE_KIND_HOST; + ret = TValue(context, "ompd_state") + .castBase(ompd_type_long_long) + .getValue(ompd_state); + if (ret != ompd_rc_ok) + return ret; + return ompd_rc_ok; } @@ -733,42 +740,6 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, return ompd_rc_ok; } -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - -ompd_rc_t ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; -#if 0 - if (!ompd_state) - return ompd_rc_needs_state_tracking; -#endif - - assert(callbacks && "Callback table not initialized!"); - - TValue teamInfo; - if (parallel_handle->lwt.address != 0) - teamInfo = TValue(context, parallel_handle->lwt) - .cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - teamInfo = - TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ - ompd_rc_t ret = teamInfo - .access("ompt_team_info") /*t.ompt_team_info*/ - .cast("ompt_team_info_t", 0) - .access("parallel_data") /*t.ompt_team_info.parallel_id*/ - .getAddress(data); - return ret; -} - - /* --- 7 Thread Inquiry ----------------------------------------------------- */ /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ @@ -975,10 +946,8 @@ ompd_rc_t ompd_get_state( ompd_address_space_context_t *context = thread_handle->ah->context; if (!context) return ompd_rc_stale_handle; -#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; -#endif ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); @@ -1033,10 +1002,8 @@ ompd_rc_t ompd_get_task_frame( ompd_address_space_context_t *context = task_handle->ah->context; if (!context) return ompd_rc_stale_handle; -#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; -#endif assert(callbacks && "Callback table not initialized!"); @@ -1070,39 +1037,6 @@ ompd_rc_t ompd_get_task_frame( return ret; } -ompd_rc_t -ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *task_data /* OUT: OpenMP task ID */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; -#if 0 - if (!ompd_state) - return ompd_rc_needs_state_tracking; -#endif - - assert(callbacks && "Callback table not initialized!"); - - TValue taskInfo; - if (task_handle->lwt.address != 0) - taskInfo = - TValue(context, task_handle->lwt).cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - taskInfo = TValue(context, task_handle->th).cast("kmp_taskdata_t", 0); /*t*/ - ompd_rc_t ret = taskInfo - 
.access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("task_data") // td->ompt_task_info.task_data - .getAddress(task_data); - - return ret; -} - #if 1 // the runtime currently does not have task function information ompd_rc_t ompd_get_task_function( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ @@ -1116,10 +1050,8 @@ ompd_rc_t ompd_get_task_function( ompd_address_space_context_t *context = task_handle->ah->context; if (!context) return ompd_rc_stale_handle; -#if 0 if (!ompd_state) return ompd_rc_needs_state_tracking; -#endif assert(callbacks && "Callback table not initialized!"); From 300c2189f98d0f93e8ef07e864a238dbe8997413 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 9 Aug 2018 14:10:12 -0700 Subject: [PATCH 37/64] Remove accidentally committed .bak file --- libompd/gdb-wrapper/ompd_typedefs.h.bak | 521 ------------------------ 1 file changed, 521 deletions(-) delete mode 100644 libompd/gdb-wrapper/ompd_typedefs.h.bak diff --git a/libompd/gdb-wrapper/ompd_typedefs.h.bak b/libompd/gdb-wrapper/ompd_typedefs.h.bak deleted file mode 100644 index 2bad5c82b..000000000 --- a/libompd/gdb-wrapper/ompd_typedefs.h.bak +++ /dev/null @@ -1,521 +0,0 @@ -/* - * ompd.h - * - * Created on: Dec 22, 2014 - * Author: Ignacio Laguna - * Joachim Protze - * Contact: ilaguna@llnl.gov - * protze@llnl.gov - */ -/****************************************************************************** - * This header file defines the OMPD interface: an interface to help debuggers - * to inspect state associated with OpenMP programming abstractions in a target - * process. The interface is implemented in a dynamically loaded library (DLL) - * that the debugger loads into its address space. - * - * Name conventions: - * - All named entities start with the prefix "ompd_" (for OpenMP debugging) - * - Type entities end with the suffix "_t" (for type) - * - Function types end with the suffix "_fn_t" (for function type) - * - Return code entities have "_rc_" in it - * - Abstractions referring to the target have the prefix "t" (e.g., - * "tmemory" for memory in the target, or "tsymbol" for symbol in the target) - * - Abstractions referring to the debugger have the prefix "d" (e.g., - * "dmemory" for memory in the debugger) - * - * Comment conventions: - * - Input function parameters denoted by "IN:" - * - Output function parameters denoted by "OUT:" - */ -/****************************************************************************** - * General types and data structures - */ -/** - * Basic types. - */ -/** - * The following definitions match with ptx information stored in DWARF - */ -/* - * Definition of OMPD states, taken from OMPT - */ -/** - * Context handle. - * This is used by the debugger to identify a target process (or core file). - * This will be cast to concrete types within the debugger. The callbacks use - * context handles to specify the debugger where to look up (since the debugger - * can be handling different contexts at the same time, e.g., processes and/or - * core files). Without context handles the debugger would not know the target - * of a callback request. - */ -/** - * OpenMP abstractions handles. - * Each operation in the OMPD interface must explicitly specify a handle for the - * context of the operation. OMPD uses context handles for OpenMP entities, such - * as threads, parallel regions, and tasks. A handle for an entity is constant - * while the entity itself is live. - */ -/** - * Other handles. 
- */ -/** - * Logical coordinates of OMP target device threads - */ -/** - * Return codes. - * Each OMPD operation returns a code. - */ -/** - * Primitive types. - */ -/** - * Primitive type sizes. - * These types are used by OMPD to interrogate the debugger about the size of - * primitive types in the target. - */ -/****************************************************************************** - * Debugger callback signatures. - * These callback function signatures are used by OMPD to obtain state - * information of a target process, in particular to interrogate about info - * that is dependent on a particular OpenMP runtime library. Typical queries are - * sizes of primitive types in the target, symbols lookup, lookup of offsets of - * fields in a type/structure, and read/write to memory in the target. - */ -/** - * Allocate memory in the debugger's address space. - */ -/** - * Free memory in the debugger's address space. - */ -/** - * Get thread specific context. - */ -/** - * Get containing (host) process context for address_space_context - */ -/** - * Look up the sizes of primitive types in the target - */ -/** - * Look up the address of a global symbol in the target - */ -/** - * Read memory from the target - */ -/** - * Write memory from the target - */ -/** - * This is used by the OMPD library to have the debugger print a string. - * The OMPD should not print directly. - */ -/** - * Callbacks table. - */ -/****************************************************************************** - * Call signatures from the debugger to the OMPD DLL. - */ -/* --- 4 Initialization ----------------------------------------------------- */ - -/** - * The OMPD function ompd_get_version_string returns a descriptive string - * describing an implementation of the OMPD library. The function - * ompd_get_version_compatibility returns an integer code used to indicate the - * revision of the OMPD specification supported by an implementation of OMPD. - */ -typedef ompd_rc_t (*ompd_get_version_fn_t) ( - int *version - ); -typedef ompd_rc_t (*ompd_get_version_string_fn_t) ( - const char **string /* OUT: OMPD version string */ - ); -/** - * Initialize OMPD. - * This provides the DLL the pointers to the debugger's functions to obtain - * information about the OpenMP runtime library. The debugger promises to - * maintain the functions valid for as long as needed. - */ -typedef ompd_rc_t (*ompd_initialize_fn_t) ( - const ompd_callbacks_t *table /* IN: callbacks table */ - ); -typedef ompd_rc_t (*ompd_process_initialize_fn_t) ( - ompd_address_space_context_t *context, /* IN: debugger handle for the target */ - ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the target */ - ); -typedef ompd_rc_t (*ompd_release_address_space_handle_fn_t) ( - ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ - ); -typedef ompd_rc_t (*ompd_device_initialize_fn_t) ( - ompd_address_space_context_t *context, /* IN: debugger handle for the device */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the device */ - ); -typedef ompd_rc_t (*ompd_finalize_fn_t) ( void ); - -/* --- 4 Handle Management -------------------------------------------------- */ - -/* --- 4.1 Thread Handles --------------------------------------------------- */ - -/** - * Retrieve handles for all OpenMP threads. 
- * - * The ompd_get_threads operation enables the debugger to obtain handles for all - * OpenMP threads. A successful invocation of ompd_get_threads returns a pointer - * to a vector of handles in thread_handle_array and returns the number of - * handles in num_handles. This call yields meaningful results only if all - * OpenMP threads are stopped; otherwise, the OpenMP runtime may be creating - * and/or destroying threads during or after the call, rendering useless the - * vector of handles returned. - */ -typedef ompd_rc_t (*ompd_get_threads_fn_t) ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ - int *num_handles /* OUT: number of handles in the array */ - ); -/** - * Retrieve handles for OpenMP threads in a parallel region. - * - * The ompd_get_thread_in_parallel operation enables the debugger to obtain - * handles for all OpenMP threads associated with a parallel region. A - * successful invocation of ompd_get_thread_in_parallel returns a pointer to a - * vector of handles in thread_handle_array and returns the number of handles in - * num_handles. This call yields meaningful results only if all OpenMP threads - * in the parallel region are stopped; otherwise, the OpenMP runtime may be - * creating and/or destroying threads during or after the call, rendering - * useless the vector of handles returned. - */ -typedef ompd_rc_t (*ompd_get_thread_in_parallel_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN */ - ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ - int *num_handles /* OUT: number of handles in the array */ - ); -typedef ompd_rc_t (*ompd_get_master_thread_in_parallel_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN */ - ompd_thread_handle_t **thread_handle); -typedef ompd_rc_t (*ompd_release_thread_handle_fn_t) ( - ompd_thread_handle_t *thread_handle -); -typedef ompd_rc_t (*ompd_thread_handle_compare_fn_t) ( - ompd_thread_handle_t *thread_handle_1, - ompd_thread_handle_t *thread_handle_2, - int *cmp_value -); -typedef ompd_rc_t (*ompd_get_thread_handle_string_id_fn_t) ( - ompd_thread_handle_t *thread_handle, - char **string_id -); -/* --- 4.2 Parallel Region Handles------------------------------------------- */ - -/** - * Retrieve the handle for the innermost patallel region for an OpenMP thread. - * - * The operation ompd_get_top_parallel_region enables the debugger to obtain - * the handle for the innermost parallel region associated with an OpenMP - * thread. This call is meaningful only if the thread whose handle is provided - * is stopped. - */ -typedef ompd_rc_t (*ompd_get_top_parallel_region_fn_t) ( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_parallel_handle_t **parallel_handle /* OUT: OpenMP parallel handle */ - ); -/** - * Retrieve the handle for an enclosing parallel region. - * - * The ompd_get_enclosing_parallel_handle operation enables the debugger to - * obtain the handle for the parallel region enclosing the parallel region - * specified by parallel_handle. This call is meaningful only if at least one - * thread in the parallel region is stopped. - */ -typedef ompd_rc_t (*ompd_get_enclosing_parallel_handle_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ - ); -/** - * Retrieve the handle for the enclosing parallel region or a task region. 
- * - * The ompd_get_task_enclosing_parallel_handle operation enables the debugger to - * obtain the handle for the parallel region enclosing the task region - * specified by task_handle. This call is meaningful only if at least one - * thread in the parallel region is stopped. - */ -typedef ompd_rc_t (*ompd_get_task_enclosing_parallel_handle_fn_t) ( - ompd_task_handle_t* task_handle, /* IN: OpenMP task handle */ - ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ - ); -typedef ompd_rc_t (*ompd_release_parallel_handle_fn_t) ( - ompd_parallel_handle_t *parallel_handle -); -typedef ompd_rc_t (*ompd_parallel_handle_compare_fn_t) ( - ompd_parallel_handle_t *parallel_handle_1, - ompd_parallel_handle_t *parallel_handle_2, - int *cmp_value -); -typedef ompd_rc_t (*ompd_get_parallel_handle_string_id_fn_t) ( - ompd_parallel_handle_t *parallel_handle, - char **string_id -); -/* --- 4.3 Task Handles ----------------------------------------------------- */ - -/** - * Retrieve the handle for the innermost task for an OpenMP thread. - * - * The debugger uses the operation ompd_get_top_task_region to obtain the handle - * for the innermost task region associated with an OpenMP thread. This call is - * meaningful only if the thread whose handle is provided is stopped. - */ -typedef ompd_rc_t (*ompd_get_top_task_region_fn_t) ( - ompd_thread_handle_t* thread_handle, /* IN: OpenMP thread handle*/ - ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ - ); -/** - * Retrieve the handle for an enclosing task. - * - * The debugger uses ompd_get_ancestor_task_region to obtain the handle for the - * task region enclosing the task region specified by task_handle. This call is - * meaningful only if the thread executing the task specified by task_handle is - * stopped. - */ -typedef ompd_rc_t (*ompd_get_ancestor_task_region_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ); -typedef ompd_rc_t (*ompd_get_generating_ancestor_task_region_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ); -typedef ompd_rc_t (*ompd_get_scheduling_ancestor_task_region_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ); -/** - * Retrieve implicit task handle for a parallel region. - * - * The ompd_get_implicit_task_in_parallel operation enables the debugger to - * obtain handles for implicit tasks associated with a parallel region. This - * call is meaningful only if all threads associated with the parallel region - * are stopped. 
- */ -typedef ompd_rc_t (*ompd_get_implicit_task_in_parallel_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_task_handle_t ***task_handle_array, /* OUT: array of OpenMP task handles */ - int *num_handles /* OUT: number of task handles */ - ); -typedef ompd_rc_t (*ompd_release_task_handle_fn_t) ( - ompd_task_handle_t *task_handle -); -typedef ompd_rc_t (*ompd_task_handle_compare_fn_t) ( - ompd_task_handle_t *task_handle_1, - ompd_task_handle_t *task_handle_2, - int *cmp_value -); -typedef ompd_rc_t (*ompd_get_task_handle_string_id_fn_t) ( - ompd_task_handle_t *task_handle, - char **string_id -); -/* --- 5o Process and Thread Settings ---------------------------------------- */ - -/** - * The functions ompd_get_num_procs and ompd_get_thread_limit are third-party - * versions of the OpenMP runtime functions omp_get_num_procs and - * omp_get_thread_limit. - */ -typedef ompd_rc_t (*ompd_get_num_procs_fn_t) ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ); -typedef ompd_rc_t (*ompd_get_thread_limit_fn_t) ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ); - /* --- 6 Parallel Region Inqueries ------------------------------------------ */ - /* --- 6.1 Settings --------------------------------------------------------- */ - /** - * Determine the number of threads associated with a parallel region. - */ - typedef ompd_rc_t (*ompd_get_num_threads_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ); - /** - * Determine the nesting depth of a particular parallel region instance. - */ - typedef ompd_rc_t (*ompd_get_level_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ); - /** - * Determine the number of enclosing active parallel regions. - * - * ompd_get_active_level returns the number of nested, active parallel regions - * enclosing the parallel region specified by its handle. - */ - typedef ompd_rc_t (*ompd_get_active_level_fn_t) ( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ); - /* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - - /* --- 7 Thread Inquiry ----------------------------------------------------- */ - /* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - /** - * Obtain an OpenMP thread handle and the internal OS thread handle for the - * selected (context) thread. - * If the function returns ompd_rc_ok then the operating system thread - * corresponds to an OpenMP thread and the thread_handle is initialized. The - * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime - * system. - */ - typedef ompd_rc_t (*ompd_get_thread_handle_fn_t) ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_osthread, - const void* osthread, - ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ - ); - /** - * Obtain the OS thread handle for an OpenMP thread handle. - * this might change over time in case virtual openmp threads migrate between - * OS threads. 
- */ - typedef ompd_rc_t (*ompd_get_osthread_fn_t) ( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_osthread, - void *osthread - ); - typedef ompd_rc_t (*ompd_get_thread_num_fn_t) ( - ompd_thread_handle_t* thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ); - /* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ - - /** - * Get the state of a thread. This can use OMPT state data structure to define - * different states of threads (e.g., idle, working, or barrier, etc) and what - * entity cased this state (e.g., address of a lock); - * - * The function ompd_get_state is a third-party version of ompt_get_state. The - * only difference between the OMPD and OMPT counterparts is that the OMPD - * version must supply a thread handle to provide a context for this inquiry. - */ - typedef ompd_rc_t (*ompd_get_state_fn_t) ( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *state, /* OUT: State of this thread */ - ompd_wait_id_t *wait_id /* OUT: Wait ID */ - ); - /* --- 8 Task Inquiry ------------------------------------------------------- */ - - /* --- 8.1 Task Function Entry Point ---------------------------------------- */ - - /** - * The ompd_get_task_function returns the entry point of the code that - * corresponds to the body of code executed by the task. - */ - typedef ompd_rc_t (*ompd_get_task_function_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *entry_point /* OUT: first instruction in the task region */ - ); - /* --- 8.2 Task Settings ---------------------------------------------------- */ - - /** - * Retrieve information from OpenMP tasks. These inquiry functions have no - * counterparts in the OMPT interface as a first-party tool can call OpenMP - * runtime inquiry functions directly. The only difference between the OMPD - * inquiry operations and their counterparts in the OpenMP runtime is that the - * OMPD version must supply a task handle to provide a context for each inquiry. - */ - typedef ompd_rc_t (*ompd_get_max_threads_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ); - typedef ompd_rc_t (*ompd_in_parallel_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in parallel? */ - ); - typedef ompd_rc_t (*ompd_in_final_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in final? */ - ); - typedef ompd_rc_t (*ompd_get_dynamic_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: ? */ - ); - typedef ompd_rc_t (*ompd_get_nested_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: Is this task nested? 
*/ - ); - typedef ompd_rc_t (*ompd_get_max_active_levels_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: max active levels */ - ); -#if 0 - typedef ompd_rc_t (*ompd_get_schedule_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_sched_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ); -#endif - typedef ompd_rc_t (*ompd_get_proc_bind_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_proc_bind_t *bind /* OUT: Kind of proc-binding */ - ); - typedef ompd_rc_t (*ompd_is_implicit_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: implicit=1, explicit=0 */ - ); -/* --- 8.3 OMPT Task Inquiry Analogues -------------------------------------- */ - -/** - * The functions defined here are third-party versions of ompt_get_task_frame - * and ompt_get_task_id. The only difference between the OMPD and OMPT - * counterparts is that the OMPD version must supply a task handle to provide a - * context for these inquiries. - */ -/** - * sp_exit - * - * This value is set once, the first time that a task exits the runtime to begin - * executing user code. This field points to the stack frame of the runtime - * procedure that called the user code. This value is NULL until just before the - * task exits the runtime. - * - * sp_reentry - * - * This value is set each time that current task re-enters the runtime to create - * new (implicit or explicit) tasks. This field points to the stack frame of the - * runtime procedure called by a task to re-enter the runtime. This value is NULL - * until just after the task re-enters the runtime. - */ -typedef ompd_rc_t (*ompd_get_task_frame_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *sp_exit, /* OUT: next frame is user code */ - ompd_address_t *sp_reentry /* OUT: previous frame is user code */ - ); -#if 0 -typedef ompd_rc_t (*ompd_get_task_id_fn_t) ( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_id_t *task_id /* OUT: OpenMP task ID */ - ); -#endif -/* --- 13 Display Control Variables ----------------------------------------- */ - -/** - * Using the ompd_display_control_vars function, the debugger can extract a - * string that contains a sequence of name/value pairs of control variables - * whose settings are (a) user controllable, and (b) important to the operation - * or performance of an OpenMP runtime system. The control variables exposed - * through this interface will include all of the OMP environment variables, - * settings that may come from vendor or platform- specific environment - * variables (e.g., the IBM XL compiler has an environment variable that - * controls spinning vs. blocking behavior), and other settings that affect - * the operation or functioning of an OpenMP runtime system (e.g., numactl - * settings that cause threads to be bound to cores). 
- */ -typedef ompd_rc_t (*ompd_get_display_control_vars_fn_t) ( - ompd_address_space_handle_t *handle, /* IN */ - const char * const **control_var_values /* OUT */ -); -typedef ompd_rc_t (*ompd_release_display_control_vars_fn_t) ( - const char * const **control_var_values /* IN */ -); From 783262c620c9b2447455e866a6ed2325af504854 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 13 Aug 2018 15:57:10 -0700 Subject: [PATCH 38/64] [OMPD] Make odb initialization at first command ODB now initializes the openmp process with ompd when the first ompd commadn is issued, not when the debugger starts. This is necessary because we now check for state tracking in the process initialization function, and OpenMP runtime may not be present when the debugger has just loaded the program. --- libompd/gdb-wrapper/OMPDCommand.cpp | 36 ++++++++++++++++++----------- libompd/gdb-wrapper/OMPDCommand.h | 3 ++- 2 files changed, 25 insertions(+), 14 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 5c61da70e..6a9d1289c 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -105,7 +105,20 @@ OMPDCommandFactory::OMPDCommandFactory() FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) #undef OMPD_FIND_API_FUNCTION +} + +OMPDCommandFactory::~OMPDCommandFactory() +{ + ompd_rc_t ret; + ret = functions->ompd_release_address_space_handle(addrhandle); + if (ret != ompd_rc_ok) + { + out << "ERROR: could not finalize target address space\n"; + } +} +void OMPDCommandFactory::initOmpd() +{ // Initialize OMPD library ompd_callbacks_t *table = getCallbacksTable(); assert(table && "Invalid callbacks table"); @@ -119,19 +132,12 @@ FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) /*&prochandle, */&addrhandle); if (ret != ompd_rc_ok) { + addrhandle = nullptr; out << "ERROR: could not initialize target process\n"; } - - icvs = OMPDIcvsPtr(new OMPDIcvs(functions, addrhandle)); -} - -OMPDCommandFactory::~OMPDCommandFactory() -{ - ompd_rc_t ret; - ret = functions->ompd_release_address_space_handle(addrhandle); - if (ret != ompd_rc_ok) + else { - out << "ERROR: could not finalize target address space\n"; + icvs = OMPDIcvsPtr(new OMPDIcvs(functions, addrhandle)); } } @@ -152,8 +158,12 @@ void * OMPDCommandFactory::findFunctionInLibrary(const char *fun) const return ret; } -OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& extraArgs) const +OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& extraArgs) { + if (addrhandle == nullptr) { + initOmpd(); + } + if (strcmp(str, "test") == 0) return new OMPDTestCallbacks(functions, addrhandle, extraArgs); else if (strcmp(str, "threads") == 0) @@ -312,12 +322,12 @@ void OMPDThreads::execute() const { ompd_word_t state; device_thread_handles.push_back(thread_handle); - functions->ompd_get_state(thread_handle, &state, NULL); + ret = functions->ompd_get_state(thread_handle, &state, NULL); if (last_state == -1) { last_state = state; last_coords = i.coord; printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); - } else if (state != last_state || i.coord.blockIdx.x != last_coords.blockIdx.x) { + } else if (state != last_state || i.coord.blockIdx.x != last_coords.blockIdx.x || i.coord.threadIdx.x != last_coords.threadIdx.x + 1) { printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); last_coords = i.coord; last_state = state; diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index f6e3a867d..c575c5f06 
100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -155,6 +155,7 @@ class OMPDCommandFactory { private: void * findFunctionInLibrary(const char *fun) const; + void initOmpd(); OMPDFunctionsPtr functions = nullptr; OMPDIcvsPtr icvs = nullptr; // ompd_process_handle_t* prochandle = nullptr; @@ -165,7 +166,7 @@ class OMPDCommandFactory OMPDCommandFactory(); ~OMPDCommandFactory(); // OMPDCommand* create(const char *str) const; - OMPDCommand* create(const char *str, const std::vector& extraArgs=std::vector()) const; + OMPDCommand* create(const char *str, const std::vector& extraArgs=std::vector()); }; typedef std::unique_ptr OMPDCommandFactoryPtr; From 17f53de786187aebf231e46b0b3ba737d0fa22ec Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 13 Aug 2018 16:00:26 -0700 Subject: [PATCH 39/64] [OMPD] Add code to support some ICVs on cuda devs --- libompd/src/omp-debug.cpp | 1 + libompd/src/omp-icv.cpp | 51 ++++++++++++++++++- .../deviceRTLs/nvptx/src/ompd-specific.cu | 12 ++--- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 949c6efd9..9638bbaa6 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -232,6 +232,7 @@ ompd_rc_t ompd_get_thread_in_parallel( (*thread_handle)->th = taddr; (*thread_handle)->ah = parallel_handle->ah; + (*thread_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp index 0b44c6796..a5d287b09 100644 --- a/libompd/src/omp-icv.cpp +++ b/libompd/src/omp-icv.cpp @@ -92,6 +92,30 @@ static ompd_rc_t ompd_get_level( return ret; } + +static ompd_rc_t ompd_get_cuda_level( + ompd_parallel_handle_t *parallel_handle, + ompd_word_t *val) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized"); + + uint16_t res; + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("ompd_npvtx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("level") + .castBase(ompd_type_short) + .getValue(res); + *val = res; + return ret; +} + + static ompd_rc_t ompd_get_active_level( ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ ompd_word_t *val /* OUT: active nesting level */ @@ -341,7 +365,32 @@ ompd_get_num_threads(ompd_parallel_handle_t *val = res; } return ret; -} +} + +static ompd_rc_t +ompd_get_cuda_num_threads(ompd_parallel_handle_t *parallel_handle, + ompd_word_t *val) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized"); + + uint16_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_npvtx_TaskDescr", 1) + .access("items__threadsInTeam") + .castBase(ompd_type_short) + .getValue(res); + *val = res; + return ret; +} ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, ompd_icv_id_t icv_id, diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index cd53817bf..c23058f1a 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ 
b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -20,10 +20,10 @@ OMPD_FOREACH_ACCESS(ompd_target_declare_access) #undef ompd_target_declare_sizeof __device__ __shared__ - uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadId; + uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam; __device__ __shared__ - uint64_t ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadId; + uint64_t ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam; __device__ void ompd_init ( void ) @@ -35,15 +35,15 @@ __device__ void ompd_init ( void ) OMPD_FOREACH_ACCESS(ompd_target_init_access) #undef ompd_target_init_access - ompd_access__omptarget_nvptx_TaskDescr__items__threadId = - (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadId); + ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam = + (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); #define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m = sizeof(((t*)0)->m); OMPD_FOREACH_ACCESS(ompd_target_init_sizeof_member) #undef ompd_target_init_sizeof_member - ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadId = - (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadId); + ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam = + (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); #define ompd_target_init_sizeof(t) ompd_sizeof__##t = sizeof(t); OMPD_FOREACH_SIZEOF(ompd_target_init_sizeof) From 598ec682419e03aa7f305273ec05941d92b59be1 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 14 Aug 2018 08:54:07 -0700 Subject: [PATCH 40/64] [OMPD] removed ompdAllocatable class libompd does not use the new operator and instead uses the tool-provided malloc callback directly. --- libompd/src/omp-debug.h | 42 +++++++----------------------------------- 1 file changed, 7 insertions(+), 35 deletions(-) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index 6ca5840b5..bd738ece8 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -45,37 +45,9 @@ extern "C" { static const ompd_callbacks_t *callbacks = nullptr; -class ompdAllocatable { -public: - static void *operator new(std::size_t sz) { - void *res; - ompd_rc_t ret = callbacks->memory_alloc(sz, &res); - if (ret == ompd_rc_ok) - return res; - throw std::bad_alloc(); - } - static void *operator new[](std::size_t sz) { - void *res; - ompd_rc_t ret = callbacks->memory_alloc(sz, &res); - if (ret == ompd_rc_ok) - return res; - throw std::bad_alloc(); - } - void operator delete(void *addr) throw() { - ompd_rc_t ret = callbacks->memory_free(addr); - if (ret != ompd_rc_ok) - throw std::bad_alloc(); - } - void operator delete[](void *addr) throw() { - ompd_rc_t ret = callbacks->memory_free(addr); - if (ret != ompd_rc_ok) - throw std::bad_alloc(); - } -}; - // Information shared by all threads in a kernel // Used to map thread handles to native cuda thread ids -typedef struct _ompd_cuda_thread_kernel_info_s : public ompdAllocatable { +typedef struct _ompd_cuda_thread_kernel_info_s { ompd_addr_t cudaDevId; ompd_addr_t cudaContext; ompd_addr_t warpSize; @@ -85,29 +57,29 @@ typedef struct _ompd_cuda_thread_kernel_info_s : public ompdAllocatable { typedef struct _ompd_address_space_context_s ompd_address_space_context_t; -typedef struct _ompd_process_handle_s : public ompdAllocatable { +typedef struct _ompd_process_handle_s { ompd_address_space_context_t *context; } ompd_process_handle_t; -typedef struct _ompd_address_space_handle_s : public ompdAllocatable { +typedef struct
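
With ompdAllocatable gone, handle objects are carved out of memory obtained through the tool's memory_alloc callback and returned with memory_free, instead of going through operator new. A rough sketch of that pattern, assuming the callbacks table and handle types declared in omp-debug.h; the helper names are illustrative:

    // Allocate a thread handle via the tool-provided callback table.
    // memory_alloc(size, &ptr) and memory_free(ptr) return ompd_rc_t,
    // matching the removed ompdAllocatable operators above.
    static ompd_rc_t allocThreadHandle(ompd_thread_handle_t **out) {
      void *mem = nullptr;
      ompd_rc_t ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), &mem);
      if (ret != ompd_rc_ok)
        return ret;                      // plain return codes, no exceptions
      *out = static_cast<ompd_thread_handle_t *>(mem);
      (*out)->cuda_kernel_info = nullptr; // assume callback memory is not zeroed
      return ompd_rc_ok;
    }

    static void releaseThreadHandle(ompd_thread_handle_t *handle) {
      if (handle)
        callbacks->memory_free(handle);
    }
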
_ompd_address_space_handle_s { ompd_address_space_context_t *context; omp_device_t kind; uint64_t id; } ompd_address_space_handle_t; -typedef struct _ompd_device_handle_s : public ompdAllocatable { +typedef struct _ompd_device_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ } ompd_device_handle_t; -typedef struct _ompd_thread_handle_s : public ompdAllocatable { +typedef struct _ompd_thread_handle_s { ompd_address_space_handle_t *ah; ompd_thread_context_t *thread_context; ompd_address_t th; /* target handle */ ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* only valid for cuda */ } ompd_thread_handle_t; -typedef struct _ompd_parallel_handle_s : public ompdAllocatable { +typedef struct _ompd_parallel_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ @@ -117,7 +89,7 @@ typedef struct _ompd_parallel_handle_s : public ompdAllocatable { */ } ompd_parallel_handle_t; -typedef struct _ompd_task_handle_s : public ompdAllocatable { +typedef struct _ompd_task_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ From 21d2ad73d05fbb4ddbf8819009c80bf811e82555 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 14 Aug 2018 08:56:08 -0700 Subject: [PATCH 41/64] [OMPD] Remove _ompd_device_handle_s struct It's not part of the spec and is not used by libompd. --- libompd/src/omp-debug.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index bd738ece8..b783323b6 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -67,11 +67,6 @@ typedef struct _ompd_address_space_handle_s { uint64_t id; } ompd_address_space_handle_t; -typedef struct _ompd_device_handle_s { - ompd_address_space_handle_t *ah; - ompd_address_t th; /* target handle */ -} ompd_device_handle_t; - typedef struct _ompd_thread_handle_s { ompd_address_space_handle_t *ah; ompd_thread_context_t *thread_context; ompd_address_t th; /* target handle */ ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* only valid for cuda */ } ompd_thread_handle_t; From 6351c6f1587babd8b97b5eadc0702f68ed9e8e89 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 23 Aug 2018 14:43:44 -0700 Subject: [PATCH 42/64] [OMPD] Work around for cuda-gdb name mangling bug --- libompd/src/TargetValue.cpp | 21 +++++++++++++++++++++ libompd/src/omp-debug.cpp | 3 ++- .../deviceRTLs/nvptx/src/ompd-specific.cu | 20 ++++++++++---------- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index ecb564dec..109c8fa01 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -49,6 +49,13 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { ompd_size_t tmpSize; std::stringstream ss; ss << "ompd_sizeof__" << typeName; + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { @@ -129,6 +136,13 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { // &fieldOffset); std::stringstream ss; ss << "ompd_access__" << typeName << "__" << fieldName; + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { @@ -175,6 +189,13 @@
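
The same three-line workaround is pasted into getSize, getElementOffset, and getElementSize: when the symbol lives in a CUDA PTX segment, one trailing underscore is appended so the lookup name matches the device-side symbols, which this patch declares with a trailing underscore in ompd-specific.cu to sidestep the cuda-gdb mangling bug. A small helper capturing that rule (a sketch, not code from the patch; the function name is made up):

    #include <sstream>
    #include <string>

    // Build the "ompd_sizeof__Type__field"-style lookup name and append the
    // trailing "_" used to dodge the cuda-gdb name-mangling issue for symbols
    // in the PTX global/shared segments.
    static std::string ompdLookupSymbol(const char *prefix, const char *typeName,
                                        const char *fieldName, // may be nullptr
                                        bool cudaPtxSegment) {
      std::stringstream ss;
      ss << prefix << typeName;
      if (fieldName)
        ss << "__" << fieldName;
      if (cudaPtxSegment)
        ss << "_"; // mirrors the workaround in TargetValue.cpp
      return ss.str();
    }
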
ompd_rc_t TType::getElementSize(const char *fieldName, ompd_size_t *size) { // &fieldOffset); std::stringstream ss; ss << "ompd_sizeof__" << typeName << "__" << fieldName; + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 9638bbaa6..aad8f830c 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -307,7 +307,8 @@ ompd_rc_t ompd_get_current_parallel_handle( } } TValue ph = prevTask.access("ompd_thread_info") - .cast("ompd_nvptx_thread_info_t", 0) + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("enclosed_parallel"); ret = ph.getAddress(&taddr); diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index c23058f1a..61bb8413c 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -7,23 +7,23 @@ __device__ __shared__ static int ompd_target_initialized; -#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m; +#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m##_; OMPD_FOREACH_ACCESS(ompd_target_declare_access) #undef ompd_target_declare_access -#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m; +#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m##_; OMPD_FOREACH_ACCESS(ompd_target_declare_sizeof_member) #undef ompd_target_declare_sizeof_member -#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t; +#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t##_; OMPD_FOREACH_SIZEOF(ompd_target_declare_sizeof) #undef ompd_target_declare_sizeof __device__ __shared__ - uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam; + uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam_; __device__ __shared__ - uint64_t ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam; + uint64_t ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam_; __device__ void ompd_init ( void ) @@ -31,21 +31,21 @@ __device__ void ompd_init ( void ) if (ompd_target_initialized) return; -#define ompd_target_init_access(t,m) ompd_access__##t##__##m = (uint64_t)&(((t*)0)->m); +#define ompd_target_init_access(t,m) ompd_access__##t##__##m##_ = (uint64_t)&(((t*)0)->m); OMPD_FOREACH_ACCESS(ompd_target_init_access) #undef ompd_target_init_access - ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam = + ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam_ = (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); -#define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m = sizeof(((t*)0)->m); +#define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m##_ = sizeof(((t*)0)->m); OMPD_FOREACH_ACCESS(ompd_target_init_sizeof_member) #undef ompd_target_init_sizeof_member - ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam = + ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam_ = (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); -#define ompd_target_init_sizeof(t) ompd_sizeof__##t = 
sizeof(t); +#define ompd_target_init_sizeof(t) ompd_sizeof__##t##_ = sizeof(t); OMPD_FOREACH_SIZEOF(ompd_target_init_sizeof) #undef ompd_target_init_sizeof From 4b6f24bcc9b4e892934fe3dff1ffbf91c3f36d32 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 23 Aug 2018 14:50:08 -0700 Subject: [PATCH 43/64] [OMPD] Add support for cuda icvs to odb --- libompd/gdb-wrapper/OMPDCommand.cpp | 64 ++++++------ libompd/gdb-wrapper/OMPDCommand.h | 19 ---- libompd/src/omp-debug.cpp | 4 +- libompd/src/omp-icv.cpp | 150 ++++++++++++++++++++-------- 4 files changed, 148 insertions(+), 89 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 6a9d1289c..eada18eff 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -537,41 +537,40 @@ vector odbGetThreadHandles(ompd_address_space_handle_t* a return thread_handles; } -vector odbGetCudaThreadHandles(ompd_address_space_handle_t* addrhandle, OMPDFunctionsPtr functions) +map odbInitCudaDevices(OMPDFunctionsPtr functions, CudaGdb &cuda, + ompd_address_space_handle_t *addrhandle) { - ompd_rc_t ret; - - CudaGdb cuda; - vector cuda_ContextPools; + map ret; map device_initialized; - map address_spaces; - vector device_thread_handles; - - for(auto i: cuda.threads) { + for (auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { - OMPDCudaContextPool* cpool; - cpool = new OMPDCudaContextPool(&i); - ompd_rc_t result; - + ret.emplace(i.coord.cudaContext, &i); device_initialized[i.coord.cudaContext] = true; - result = functions->ompd_device_initialize( + functions->ompd_device_initialize( addrhandle, - cpool->getGlobalOmpdContext(), - ompd_device_kind_cuda, + ret.at(i.coord.cudaContext).getGlobalOmpdContext(), + OMP_DEVICE_KIND_CUDA, sizeof(i.coord.cudaContext), &i.coord.cudaContext, - &cpool->ompd_device_handle); + &ret.at(i.coord.cudaContext).ompd_device_handle); + } + } + return ret; +} - if (result != ompd_rc_ok) - { - continue; - } +vector odbGetCudaThreadHandles( + OMPDFunctionsPtr functions, + CudaGdb &cuda, + map &device_handles) +{ + ompd_rc_t ret; - address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; - } + vector device_thread_handles; + + for(auto i: cuda.threads) { ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( - address_spaces[i.coord.cudaContext], + device_handles.at(i.coord.cudaContext).ompd_device_handle, ompd_thread_id_cudalogical, sizeof(i.coord), &i.coord, &thread_handle); @@ -827,7 +826,9 @@ void OMPDParallelRegions::execute() const // // For Cuda devices // - auto cuda_thread_handles = odbGetCudaThreadHandles(addrhandle, functions); + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); std::map, OMPDParallelHandleCmp> cuda_parallel_handles(parallel_cmp_op); @@ -837,11 +838,18 @@ void OMPDParallelRegions::execute() const } } + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); + printf("DEVICE PARALLEL REGIONS\n"); - printf("Parallel Handle Num Threads \n"); - printf("------------------------------ \n"); + printf("Parallel Handle Num Threads ICV level\n"); + printf("------------------------------------------\n"); for (auto &p: cuda_parallel_handles) { - printf("%-15p %-10zu\n", p.first, p.second.size()); + 
ompd_word_t icv_level; + cudaIcvs.get(p.first, "levels-var", &icv_level); + printf("%-15p %-10zu %ld\n", p.first, p.second.size(), icv_level); } for (auto t: cuda_thread_handles) { diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index c575c5f06..5c221b57c 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -33,25 +33,6 @@ #include "ompd_typedefs.h" //#include "ompd_test.h" - -/* - * The macro is used to create code to register all implemented ompd - * API functions with the CommandFactory - * For new implemented API function just add a new OMPD_DO line - */ - - -#define FOREACH_OMPD_CALLBACK_FN(macro) \ -macro(ompd_dmemory_alloc) \ -macro(ompd_dmemory_free) \ -macro(ompd_tsizeof_prim) \ -macro(ompd_tsymbol_addr) \ -macro(ompd_ttype) \ -macro(ompd_ttype_sizeof) \ -macro(ompd_ttype_offset) \ -macro(ompd_tmemory_access) \ -macro(ompd_print_string) - #define FOREACH_OMPD_API_FN(macro) \ macro(ompd_process_initialize) \ macro(ompd_device_initialize) \ diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index aad8f830c..c35f2d613 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -204,10 +204,10 @@ ompd_rc_t ompd_get_thread_in_parallel( if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { ret = TValue(context, parallel_handle->th) - .cast("ompd_npvtx_parallel_info_t", 0, + .cast("ompd_nvptx_parallel_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("parallel_tasks") - .cast("omptarget_npvtx_TaskDescr", 1) + .cast("omptarget_nvptx_TaskDescr", 1) .getPtrArrayElement(nth_handle) .dereference() .getAddress(&taddr); diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp index a5d287b09..3e1601d63 100644 --- a/libompd/src/omp-icv.cpp +++ b/libompd/src/omp-icv.cpp @@ -2,17 +2,17 @@ #include "ompd-private.h" #include "TargetValue.h" -#define FOREACH_OMPD_ICV(macro) \ - macro (levels_var, "levels-var", ompd_scope_parallel) \ - macro (active_levels_var, "active-levels-var", ompd_scope_parallel) \ - macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space) \ - macro (max_active_levels_var, "max-active-levels-var", ompd_scope_task) \ - macro (bind_var, "bind-var", ompd_scope_task) \ - macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space) \ - macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread) \ - macro (final_var, "ompd-final-var", ompd_scope_task) \ - macro (implicit_var, "ompd-implicit-var", ompd_scope_task) \ - macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel) \ +#define FOREACH_OMPD_ICV(macro) \ + macro (levels_var, "levels-var", ompd_scope_parallel, 1) \ + macro (active_levels_var, "active-levels-var", ompd_scope_parallel, 0) \ + macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space, 0) \ + macro (max_active_levels_var, "max-active-levels-var", ompd_scope_task, 0) \ + macro (bind_var, "bind-var", ompd_scope_task, 0) \ + macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space, 0) \ + macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread, 1) \ + macro (final_var, "ompd-final-var", ompd_scope_task, 0) \ + macro (implicit_var, "ompd-implicit-var", ompd_scope_task, 0) \ + macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel, 1) \ void __ompd_init_icvs(const ompd_callbacks_t *table) { callbacks = table; @@ -20,7 +20,7 @@ void __ompd_init_icvs(const ompd_callbacks_t *table) { enum ompd_icv { ompd_icv_undefined_marker = 0, // ompd_icv_undefined is already defined in ompd.h -#define 
ompd_icv_macro(v, n, s) ompd_icv_ ## v, +#define ompd_icv_macro(v, n, s, d) ompd_icv_ ## v, FOREACH_OMPD_ICV(ompd_icv_macro) #undef ompd_icv_macro ompd_icv_after_last_icv @@ -28,18 +28,57 @@ enum ompd_icv { static const char *ompd_icv_string_values[] = { "undefined", -#define ompd_icv_macro(v, n, s) n, +#define ompd_icv_macro(v, n, s, d) n, FOREACH_OMPD_ICV(ompd_icv_macro) #undef ompd_icv_macro }; static const ompd_scope_t ompd_icv_scope_values[] = { ompd_scope_global, // undefined marker -#define ompd_icv_macro(v, n, s) s, +#define ompd_icv_macro(v, n, s, d) s, FOREACH_OMPD_ICV(ompd_icv_macro) #undef ompd_icv_macro }; +static const uint8_t ompd_icv_available_cuda[] = { + 1, // undefined marker +#define ompd_icv_macro(v, n, s, d) d, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro + 1, // icv after last icv marker +}; + + +static ompd_rc_t ompd_enumerate_icvs_cuda(ompd_icv_id_t current, + ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more) { + int next_possible_icv = current; + do { + next_possible_icv++; + } while (!ompd_icv_available_cuda[next_possible_icv]); + + if (next_possible_icv >= ompd_icv_after_last_icv) { + return ompd_rc_bad_input; + } + + *next_id = next_possible_icv; + *next_icv_name = ompd_icv_string_values[*next_id]; + *next_scope = ompd_icv_scope_values[*next_id]; + + do { + next_possible_icv++; + } while (!ompd_icv_available_cuda[next_possible_icv]); + + if (next_possible_icv >= ompd_icv_after_last_icv) { + *more = 0; + } else { + *more = 1; + } + return ompd_rc_ok; +} + ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, ompd_icv_id_t current, ompd_icv_id_t *next_id, const char **next_icv_name, @@ -49,7 +88,8 @@ ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, return ompd_rc_stale_handle; } if (handle->kind == OMP_DEVICE_KIND_CUDA) { - return ompd_rc_unsupported; + return ompd_enumerate_icvs_cuda(current, next_id, next_icv_name, + next_scope, more); } if (current + 1 >= ompd_icv_after_last_icv) { return ompd_rc_bad_input; @@ -93,7 +133,7 @@ static ompd_rc_t ompd_get_level( } -static ompd_rc_t ompd_get_cuda_level( +static ompd_rc_t ompd_get_level_cuda( ompd_parallel_handle_t *parallel_handle, ompd_word_t *val) { if (!parallel_handle->ah) @@ -106,7 +146,7 @@ static ompd_rc_t ompd_get_cuda_level( uint16_t res; ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("ompd_npvtx_parallel_info_t", 0, + .cast("ompd_nvptx_parallel_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("level") .castBase(ompd_type_short) @@ -384,7 +424,7 @@ ompd_get_cuda_num_threads(ompd_parallel_handle_t *parallel_handle, .cast("ompd_nvptx_parallel_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("parallel_tasks") - .cast("omptarget_npvtx_TaskDescr", 1) + .cast("omptarget_nvptx_TaskDescr", 1) .access("items__threadsInTeam") .castBase(ompd_type_short) .getValue(res); @@ -405,30 +445,60 @@ ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, return ompd_rc_bad_input; } - switch (icv_id) { - case ompd_icv_levels_var: - return ompd_get_level((ompd_parallel_handle_t *)handle, icv_value); - case ompd_icv_active_levels_var: - return ompd_get_active_level((ompd_parallel_handle_t *)handle, icv_value); - case ompd_icv_thread_limit_var: - return ompd_get_thread_limit((ompd_address_space_handle_t*)handle, icv_value); - case ompd_icv_max_active_levels_var: - return ompd_get_max_active_levels((ompd_task_handle_t*)handle, icv_value); - case ompd_icv_bind_var: - return 
ompd_get_proc_bind((ompd_task_handle_t*)handle, icv_value); - case ompd_icv_num_procs_var: - return ompd_get_num_procs((ompd_address_space_handle_t*)handle, icv_value); - case ompd_icv_thread_num_var: - return ompd_get_thread_num((ompd_thread_handle_t*)handle, icv_value); - case ompd_icv_final_var: - return ompd_in_final((ompd_task_handle_t*)handle, icv_value); - case ompd_icv_implicit_var: - return ompd_is_implicit((ompd_task_handle_t*)handle, icv_value); - case ompd_icv_team_size_var: - return ompd_get_num_threads((ompd_parallel_handle_t*)handle, icv_value); + omp_device_t device_kind; + + switch (scope) { + case ompd_scope_thread: + device_kind = ((ompd_thread_handle_t *)handle)->ah->kind; + break; + case ompd_scope_parallel: + device_kind = ((ompd_parallel_handle_t *)handle)->ah->kind; + break; + case ompd_scope_address_space: + device_kind = ((ompd_address_space_handle_t *)handle)->kind; + break; + case ompd_scope_task: + device_kind = ((ompd_task_handle_t *)handle)->ah->kind; + break; default: - return ompd_rc_unsupported; + return ompd_rc_bad_input; + } + + + if (device_kind == OMP_DEVICE_KIND_HOST) { + switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_active_levels_var: + return ompd_get_active_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_thread_limit_var: + return ompd_get_thread_limit((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_max_active_levels_var: + return ompd_get_max_active_levels((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_bind_var: + return ompd_get_proc_bind((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_num_procs_var: + return ompd_get_num_procs((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_thread_num_var: + return ompd_get_thread_num((ompd_thread_handle_t*)handle, icv_value); + case ompd_icv_final_var: + return ompd_in_final((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_implicit_var: + return ompd_is_implicit((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads((ompd_parallel_handle_t*)handle, icv_value); + default: + return ompd_rc_unsupported; + } + } else if (device_kind == OMP_DEVICE_KIND_CUDA) { + switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level_cuda((ompd_parallel_handle_t *)handle, icv_value); + default: + return ompd_rc_unsupported; + } } + return ompd_rc_unsupported; } ompd_rc_t From 0322eabe9cf9036f3b25ce74528cd39cbd5a7656 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 27 Aug 2018 08:41:28 -0700 Subject: [PATCH 44/64] [OMPD] Fix enclosing_parallel + enable target ICVs --- libompd/gdb-wrapper/OMPDCommand.cpp | 12 ++++-- libompd/src/omp-debug.cpp | 60 +++++++++++++++++++---------- libompd/src/omp-icv.cpp | 9 +++-- 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index eada18eff..f802dab1f 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -844,12 +844,13 @@ void OMPDParallelRegions::execute() const OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); printf("DEVICE PARALLEL REGIONS\n"); - printf("Parallel Handle Num Threads ICV level\n"); - printf("------------------------------------------\n"); + printf("Parallel Handle Num Threads ICV Num Threads ICV level\n"); + printf("------------------------------------------------------------\n"); for 
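
From a tool's point of view, the ICV tables above are consumed through just two calls: ompd_enumerate_icvs to discover what an address space exposes (the CUDA variant skips entries whose availability flag is 0) and ompd_get_icv_from_scope to read a value. A hedged usage sketch; the two API names and their parameters come from omp-icv.cpp above, while the loop itself and the assumption that enumeration starts at the undefined marker (0) are illustrative, and the ompd.h declarations are assumed to be in scope:

    #include <cstdio>

    // List every parallel-scoped ICV of one address space and print its value
    // for a given parallel region handle.
    static void listParallelIcvs(ompd_address_space_handle_t *ah,
                                 ompd_parallel_handle_t *ph) {
      ompd_icv_id_t current = 0;       // assumed: 0 is the undefined marker
      const char *name = nullptr;
      ompd_scope_t scope;
      int more = 1;
      while (more) {
        ompd_icv_id_t next;
        if (ompd_enumerate_icvs(ah, current, &next, &name, &scope, &more) !=
            ompd_rc_ok)
          break;
        current = next;
        if (scope != ompd_scope_parallel)
          continue;
        ompd_word_t value;
        if (ompd_get_icv_from_scope(ph, scope, next, &value) == ompd_rc_ok)
          std::printf("%s = %ld\n", name, (long)value);
      }
    }
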
(auto &p: cuda_parallel_handles) { - ompd_word_t icv_level; + ompd_word_t icv_level, icv_num_threads; + cudaIcvs.get(p.first, "ompd-team-size-var", &icv_num_threads); cudaIcvs.get(p.first, "levels-var", &icv_level); - printf("%-15p %-10zu %ld\n", p.first, p.second.size(), icv_level); + printf("%-15p %-10zu %-14ld %ld\n", p.first, p.second.size(), icv_num_threads, icv_level); } for (auto t: cuda_thread_handles) { @@ -858,6 +859,9 @@ void OMPDParallelRegions::execute() const for (auto &p: cuda_parallel_handles) { functions->ompd_release_parallel_handle(p.first); } + for (auto &d: cuda_device_handles) { + functions->ompd_release_address_space_handle(d.second.ompd_device_handle); + } } const char *OMPDParallelRegions::toString() const diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index c35f2d613..31f646e8c 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -290,26 +290,25 @@ ompd_rc_t ompd_get_current_parallel_handle( ret = prevTask.getAddress(&taddr); + TValue ph; if (ret != ompd_rc_ok) { if (taddr.address == 0) { - prevTask = TValue(context, NULL, - "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) - .cast("omptarget_nvptx_ThreadPrivateContext", 1, - OMPD_SEGMENT_CUDA_PTX_SHARED) - .access("teamContext") - .cast("omptarget_nvptx_TeamDescr", 0) - .access("levelZeroTaskDescr") - .cast("omptarget_nvptx_TaskDescr", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ph = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); } else { return ret; } + } else { + ph = prevTask.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel"); } - TValue ph = prevTask.access("ompd_thread_info") - .cast("ompd_nvptx_thread_info_t", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .access("enclosed_parallel"); ret = ph.getAddress(&taddr); if (ret != ompd_rc_ok) @@ -397,12 +396,30 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( .cast("omptarget_nvptx_TaskDescr", 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("prev") - .cast("omptarget_nvptx_TaskDescr", 1) + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) .dereference(); ret = prevTaskDescr.getAddress(&taddr); + // If the previous task of the tasks of the current parallel region is + // NULL, then we got the parallel handle for the (implicit?) top level + // task which has no enclosing task. if (ret != ompd_rc_ok) { + return ret; + } + + // The instance of TaskDescr for the previous task contains the parallel + // info for the current parallel region. 
So we have to go back to the + // previous task of the previous task + prevTaskDescr = prevTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference(); + + ret = prevTaskDescr.getAddress(&taddr); + + if (ret != ompd_rc_ok) { if (taddr.address == 0 && level == 1) { // If we are in generic mode, there is an implicit parallel region // around the master thread @@ -416,15 +433,18 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( } } else { prevTaskDescr = prevTaskDescr.access("ompd_thread_info") - .cast("ompd_nvptx_thread_info_t", 0) + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("enclosed_parallel"); } + ret = prevTaskDescr.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getAddress(&taddr); - prevTaskDescr.cast("ompd_nvptx_parallel_info_t", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .getAddress(&taddr); - + if (ret != ompd_rc_ok) { + return ret; + } } else { ret = ompd_rc_stale_handle; TValue lwtValue = TValue(context, parallel_handle->lwt); diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp index 3e1601d63..54207001c 100644 --- a/libompd/src/omp-icv.cpp +++ b/libompd/src/omp-icv.cpp @@ -408,7 +408,7 @@ ompd_get_num_threads(ompd_parallel_handle_t } static ompd_rc_t -ompd_get_cuda_num_threads(ompd_parallel_handle_t *parallel_handle, +ompd_get_num_threads_cuda(ompd_parallel_handle_t *parallel_handle, ompd_word_t *val) { if (!parallel_handle->ah) return ompd_rc_stale_handle; @@ -424,9 +424,10 @@ ompd_get_cuda_num_threads(ompd_parallel_handle_t *parallel_handle, .cast("ompd_nvptx_parallel_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("parallel_tasks") - .cast("omptarget_nvptx_TaskDescr", 1) + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("items__threadsInTeam") - .castBase(ompd_type_short) + .castBase() .getValue(res); *val = res; return ret; @@ -494,6 +495,8 @@ ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, switch (icv_id) { case ompd_icv_levels_var: return ompd_get_level_cuda((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads_cuda((ompd_parallel_handle_t*)handle, icv_value); default: return ompd_rc_unsupported; } From 098dd559417365d2072647bf92ec10ab0ac3e9b4 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 30 Aug 2018 16:30:47 -0700 Subject: [PATCH 45/64] [OMPD] Fix ompd_get_thread_in_parallel + add test --- libompd/gdb-wrapper/OMPDCommand.cpp | 128 ++++++++++++++++++++++++--- libompd/gdb-wrapper/OMPDCommand.h | 19 +++- libompd/gdb-wrapper/StringParser.cpp | 1 + libompd/src/omp-debug.cpp | 67 +++++++++++--- libompd/src/omp-debug.h | 1 - 5 files changed, 193 insertions(+), 23 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index f802dab1f..a30c880c7 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -175,7 +175,7 @@ OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& e else if (strcmp(str, "api") == 0) return new OMPDApi(functions, addrhandle, extraArgs); else if (strcmp(str, "testapi") == 0) - return new OMPDTest(functions, addrhandle, extraArgs); + return new OMPDTest(functions, addrhandle, icvs, extraArgs); else if (strcmp(str, "parallel") == 0) return new OMPDParallelRegions(functions, addrhandle, icvs, extraArgs); else if (strcmp(str, "tasks") == 0) @@ -703,19 +703,58 @@ vector odbGetImplicitTasks(OMPDFunctionsPtr functions, ompd ph, i, 
&task_handle); return_handles.push_back(task_handle); } -#if 0 - ompd_task_handle_t* task_handles; - /*ret = */functions->ompd_get_task_in_parallel( - ph, &task_handles, &num_tasks); - for(int i=0; i thread_handles) { + ompd_rc_t ret; + bool check_passed = true; + int64_t icv_num_threads; + int64_t icv_level; + + icvs->get(ph, "levels-var", &icv_level); + + ret = icvs->get(ph, "ompd-team-size-var", &icv_num_threads); + if (ret != ompd_rc_ok) { + cout << "Error: could not retrieve icv 'ompd-team-size-var' (" << ret << ")" << endl; + return false; + } + + OMPDThreadHandleCmp thread_cmp_op(functions); + std::set unique_thread_handles(thread_handles.begin(), + thread_handles.end(), + thread_cmp_op); + + sout << "Checking parallel region with level " << icv_level << " and " + << icv_num_threads << " threads (overall " << unique_thread_handles.size() + << " associated threads)" << endl; + + ompd_thread_handle_t *th; + for(int i = 0; i < icv_num_threads; i++) { + ret = functions->ompd_get_thread_in_parallel(ph, i, &th); + if (ret != ompd_rc_ok) { + cout << "Could not retrieve thread handle " << i << " in parallel (" << ret << ")" << endl; + check_passed = false; + continue; + } + + auto matched_th = unique_thread_handles.find(th); + if (matched_th == unique_thread_handles.end()) { + cout << "Thread handle retrieved with ompd_get_thread_in_parallel doesn't match any thread associated with the parallel region (could already have been matched)" << endl; + check_passed = false; + } else { + sout << "Found matching thread for thread " << i << " in parallel region" << endl; + // we dont want a thread matched twice + unique_thread_handles.erase(matched_th); + } + functions->ompd_release_thread_handle(th); + } + return check_passed; +} + void OMPDTest::execute() const { // ompd_rc_t ret; @@ -776,8 +815,75 @@ void OMPDTest::execute() const functions->ompd_release_thread_handle(thr_h); } } + else if (extraArgs[0] == "parallel-threads") + { + // Checks if the thread handles returned by ompd_get_thread_in_parallel make sense + if (extraArgs.size() > 1) { + hout << "Usage: odb testapi parallel-threads" << endl; + return; + } + // Check host parallel regions + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + OMPDParallelHandleCmp parallel_cmp_op(functions); + std::map, + OMPDParallelHandleCmp> host_parallel_handles(parallel_cmp_op); + for (auto t: host_thread_handles) { + for (auto parallel_handle: odbGetParallelRegions(functions, t)) + { + host_parallel_handles[parallel_handle].push_back(t); + } + } + + bool host_check_passed = true; + for (auto &ph_threads: host_parallel_handles) { + if (!odbCheckThreadsInParallel(functions, icvs, ph_threads.first, ph_threads.second)) { + host_check_passed = false; + } + } + + cout << "Host check passed: " << host_check_passed << "\n" << endl; + + for (auto ph: host_parallel_handles) { + functions->ompd_release_parallel_handle(ph.first); + } + + for (auto th: host_thread_handles) { + functions->ompd_release_thread_handle(th); + } + + // + // For Cuda devices + // + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDParallelHandleCmp> cuda_parallel_handles(parallel_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto p: odbGetParallelRegions(functions, t)) { + cuda_parallel_handles[p].push_back(t); + } + } + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just 
use the first one + + auto cudaIcvs = OMPDIcvsPtr(new OMPDIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle)); + + bool cuda_check_passed = true; + for (auto ph_threads: cuda_parallel_handles) { + if (!odbCheckThreadsInParallel(functions, cudaIcvs, ph_threads.first, ph_threads.second)) { + cuda_check_passed = false; + } + } + + cout << "Cuda check passed: " << cuda_check_passed << endl; + return; + } } const char* OMPDTest::toString() const diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index 5c221b57c..756658a69 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -117,6 +117,19 @@ class OMPDParallelHandleCmp } }; +class OMPDThreadHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDThreadHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_thread_handle_t *a, ompd_thread_handle_t *b) { + int cmp = 0; + functions->ompd_thread_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + class OMPDTaskHandleCmp { OMPDFunctionsPtr functions; @@ -273,9 +286,13 @@ class OMPDTest : public OMPDCommand void execute() const; const char* toString() const; protected: - OMPDTest(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const std::vector& args) : OMPDCommand(f, ah, args){}; + OMPDTest(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, + const OMPDIcvsPtr &icvs, const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; }; class OMPDParallelRegions : public OMPDCommand diff --git a/libompd/gdb-wrapper/StringParser.cpp b/libompd/gdb-wrapper/StringParser.cpp index 0df120459..d1cc67063 100644 --- a/libompd/gdb-wrapper/StringParser.cpp +++ b/libompd/gdb-wrapper/StringParser.cpp @@ -225,6 +225,7 @@ vector StringParser::matchCudaThreadsInfo( coord.cudaContext = ctx; coord.cudaDevId = dev; coord.kernelId = kernel; + coord.warpSize = 0; for (int b = 0; b < threadcounts.size(); ++b) { coord.blockIdx.x = b; diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 31f646e8c..8bbc6236b 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -193,7 +193,6 @@ ompd_rc_t ompd_get_thread_in_parallel( return ompd_rc_stale_handle; ompd_address_space_context_t *context = parallel_handle->ah->context; ompd_rc_t ret; - int i; if (!context) return ompd_rc_stale_handle; @@ -203,15 +202,45 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_address_t taddr; if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { - ret = TValue(context, parallel_handle->th) - .cast("ompd_nvptx_parallel_info_t", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .access("parallel_tasks") - .cast("omptarget_nvptx_TaskDescr", 1) - .getPtrArrayElement(nth_handle) - .dereference() - .getAddress(&taddr); + uint16_t thread_idx; + // We cannot use the task descriptor associated with the parallel info as + // their task might not be currently active + // So to get the current thread, we access the tasks thread info and get + // get its threadIdx.x + auto TaskDescr = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(nth_handle); + ret = TaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("threadIdx_x") + .castBase(ompd_type_short) + .getValue(thread_idx); + + if (ret != ompd_rc_ok) { + 
return ret; + } + + ret = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 2, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getPtrArrayElement(thread_idx) + .dereference() + .getAddress(&taddr); + + if (taddr.address == 0 && thread_idx % 32 == 0) { + ret = TaskDescr.getAddress(&taddr); + } } else { ret = TValue(context, parallel_handle->th) /* t */ .cast("kmp_base_team_t", 0) @@ -254,7 +283,26 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, return ompd_rc_stale_handle; if (!thread_handle_2) return ompd_rc_stale_handle; + if (thread_handle_1->ah->kind != thread_handle_2->ah->kind) + return ompd_rc_bad_input; *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; + if (*cmp_value == 0 && thread_handle_1->ah->kind == OMP_DEVICE_KIND_CUDA) { + *cmp_value = thread_handle_1->cuda_kernel_info->cudaDevId - + thread_handle_2->cuda_kernel_info->cudaDevId; + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->cudaContext - + thread_handle_2->cuda_kernel_info->cudaContext; + } + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->warpSize - + thread_handle_2->cuda_kernel_info->warpSize; + } + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->gridId - + thread_handle_2->cuda_kernel_info->gridId; + } + } + return ompd_rc_ok; } @@ -877,7 +925,6 @@ ompd_get_thread_handle(ompd_address_space_handle_t (*thread_handle)->cuda_kernel_info->cudaContext = p->cudaContext; (*thread_handle)->cuda_kernel_info->warpSize = p->warpSize; (*thread_handle)->cuda_kernel_info->gridId = p->gridId; - (*thread_handle)->cuda_kernel_info->kernelId = p->kernelId; } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index b783323b6..706f91644 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -52,7 +52,6 @@ typedef struct _ompd_cuda_thread_kernel_info_s { ompd_addr_t cudaContext; ompd_addr_t warpSize; ompd_addr_t gridId; - ompd_addr_t kernelId; } ompd_cuda_thread_kernel_info_t; typedef struct _ompd_address_space_context_s ompd_address_space_context_t; From 66b0339e6d26d9e3e60ce73b388615094c06f05b Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 4 Sep 2018 11:25:55 -0700 Subject: [PATCH 46/64] [OMPD] Update ompd_types.h + remove kernelId field --- libompd/gdb-wrapper/StringParser.cpp | 1 - libompd/src/ompd_types.h | 113 +++++++++++++-------------- 2 files changed, 55 insertions(+), 59 deletions(-) diff --git a/libompd/gdb-wrapper/StringParser.cpp b/libompd/gdb-wrapper/StringParser.cpp index d1cc67063..e3ebf3f93 100644 --- a/libompd/gdb-wrapper/StringParser.cpp +++ b/libompd/gdb-wrapper/StringParser.cpp @@ -224,7 +224,6 @@ vector StringParser::matchCudaThreadsInfo( coord.gridId = grid; coord.cudaContext = ctx; coord.cudaDevId = dev; - coord.kernelId = kernel; coord.warpSize = 0; for (int b = 0; b < threadcounts.size(); ++b) { diff --git a/libompd/src/ompd_types.h b/libompd/src/ompd_types.h index acc82f750..a3e3b15c0 100644 --- a/libompd/src/ompd_types.h +++ b/libompd/src/ompd_types.h @@ -1,69 +1,66 @@ -#ifndef OMPD_TYPES_H_ -#define OMPD_TYPES_H_ +/* +* @@name: ompd_types.h +*/ +#ifndef __OPMD_TYPES_H +#define __OPMD_TYPES_H +#include "omp_types.h" +#include "ompd.h" -#ifdef __cplusplus -extern "C" { -#endif - -// Values for 
omp_device_kind -#define OMP_DEVICE_KIND_HOST 1 -#define OMP_DEVICE_KIND_CUDA 2 - -// Values for ompd_thread_id_t -#define OMPD_THREAD_ID_PTHREAD 0 -#define OMPD_THREAD_ID_LWP 1 -#define OMPD_THREAD_ID_WINTHREAD 2 -#define OMPD_THREAD_ID_CUDALOGICAL 3 -#define OMPD_THREAD_ID_MAX 4 +#define OMPD_TYPES_VERSION 20170927 /* YYYYMMDD Format */ -#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) -#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) -/** - * The following definitions match with ptx information stored in DWARF - */ -#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) -#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) -#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) -#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) -#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) -#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) -#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) -#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) -#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) -#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) -#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) -#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) -#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) -#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) -#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) -#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) +/* Kinds of device threads */ +#define OMPD_THREAD_ID_PTHREAD ((ompd_thread_id_t)0) +#define OMPD_THREAD_ID_LWP ((ompd_thread_id_t)1) +#define OMPD_THREAD_ID_WINTHREAD ((ompd_thread_id_t)2) +#define OMPD_THREAD_ID_CUDALOGICAL ((ompd_thread_id_t)3) +/* The range of non-standard implementation defined values */ +#define OMPD_THREAD_ID_LO ((ompd_thread_id_t)1000000) +#define OMPD_THREAD_ID_HI ((ompd_thread_id_t)1100000) - /** - * Logical coordinates of OMP target device threads - */ +/* Target Cuda device-specific thread identification */ typedef struct ompd_dim3_t { - ompd_word_t x; - ompd_word_t y; - ompd_word_t z; + ompd_addr_t x; + ompd_addr_t y; + ompd_addr_t z; } ompd_dim3_t; typedef struct ompd_cudathread_coord_t { - ompd_addr_t cudaDevId; - ompd_addr_t cudaContext; - ompd_addr_t warpSize; - ompd_addr_t gridId; - ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work - // with grids too well. 
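
This coordinate is what a debugger passes to ompd_get_thread_handle with ompd_thread_id_cudalogical, as odb already does in odbGetCudaThreadHandles. A sketch of filling one in by hand; the helper and its parameters are illustrative, the field names match ompd_cudathread_coord_t as defined in this header, and the ompd.h/ompd_types.h declarations are assumed to be included:

    #include <cstring>

    // Build a logical CUDA thread coordinate and exchange it for an OMPD
    // thread handle (error handling trimmed to the return code).
    static ompd_rc_t cudaThreadHandleFor(ompd_address_space_handle_t *device,
                                         ompd_addr_t devId, ompd_addr_t context,
                                         ompd_addr_t gridId, ompd_addr_t blockX,
                                         ompd_addr_t threadX,
                                         ompd_thread_handle_t **out) {
      ompd_cudathread_coord_t coord;
      std::memset(&coord, 0, sizeof(coord)); // y/z components stay zero
      coord.cudaDevId = devId;
      coord.cudaContext = context;
      coord.gridId = gridId;
      coord.warpSize = 0;                    // StringParser.cpp also leaves this 0
      coord.blockIdx.x = blockX;
      coord.threadIdx.x = threadX;
      return ompd_get_thread_handle(device, ompd_thread_id_cudalogical,
                                    sizeof(coord), &coord, out);
    }
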
- ompd_dim3_t gridDim; - ompd_dim3_t blockDim; - ompd_dim3_t blockIdx; - ompd_dim3_t threadIdx; + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_dim3_t gridDim; + ompd_dim3_t blockDim; + ompd_dim3_t blockIdx; + ompd_dim3_t threadIdx; } ompd_cudathread_coord_t; -#ifdef __cplusplus -} +/* Memory Access Segment definitions for Host and Target Devices */ +#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) + +/* Cuda-specific values consistent with those defined in cudadebugger.h */ +#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) +#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) +#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) +#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) +#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) +#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) +#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) +#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) +#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) +#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) +#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) +#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) +#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) +#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) +#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) + +/* Kinds of device device address spaces */ +#define OMP_DEVICE_KIND_HOST ((omp_device_t)1) +#define OMP_DEVICE_KIND_CUDA ((omp_device_t)2) +/* The range of non-standard implementation defined values */ +#define OMP_DEVICE_IMPL_LO ((omp_device_t)1000000) +#define OMP_DEVICE_IMPL_HI ((omp_device_t)1100000) #endif -#endif /*OMPD_TYPES_H_*/ From 09ed60b37021c6a94c9c0e04ca2fff2e11f43cf8 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Wed, 5 Sep 2018 11:14:21 -0700 Subject: [PATCH 47/64] [OMPD] Fix ompd_types.h --- libompd/src/ompd_types.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libompd/src/ompd_types.h b/libompd/src/ompd_types.h index a3e3b15c0..f6c62d566 100644 --- a/libompd/src/ompd_types.h +++ b/libompd/src/ompd_types.h @@ -3,7 +3,6 @@ */ #ifndef __OPMD_TYPES_H #define __OPMD_TYPES_H -#include "omp_types.h" #include "ompd.h" #define OMPD_TYPES_VERSION 20170927 /* YYYYMMDD Format */ From b2be2d6965d022c833e9601031983426fcec4c98 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 7 Sep 2018 13:42:42 -0700 Subject: [PATCH 48/64] [OMPD] Add some support for tasks for cuda --- libompd/gdb-wrapper/OMPDCommand.cpp | 50 ++++++- libompd/src/omp-debug.cpp | 197 ++++++++++++++++++++++------ 2 files changed, 204 insertions(+), 43 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index a30c880c7..686120dfb 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -653,7 +653,7 @@ bool odbCheckParallelNumThreads(OMPDFunctionsPtr functions, vector ths) { sout << "Checking of task IDs has been disable for upgrade of ompd in branch ompd-devices\n"; - // MARKER_MR: TODO: fix checking of task ids + // TODO(mr): fix checking of task ids return true; #if 0 bool res=true; @@ -686,7 +686,7 @@ vector odbGetTaskRegions(OMPDFunctionsPtr functions, ompd_t { task_handles.push_back(task_handle); ret = functions->ompd_get_generating_task_handle( - task_handle, &task_handle); // MARKER_MR: TODO: is it generating or scheduling task or something different? 
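
odbGetTaskRegions above climbs from the current task through its ancestors by repeatedly calling ompd_get_generating_task_handle until the runtime reports no further ancestor. The same loop as a stand-alone sketch (handle release is left to the caller; the helper name is made up):

    #include <vector>

    // Collect a task handle and all of its generating ancestors; the task
    // itself comes first, the outermost ancestor last.
    static std::vector<ompd_task_handle_t *>
    collectTaskAncestry(OMPDFunctionsPtr functions, ompd_task_handle_t *task) {
      std::vector<ompd_task_handle_t *> chain;
      ompd_task_handle_t *current = task;
      while (current) {
        chain.push_back(current);
        ompd_task_handle_t *parent = nullptr;
        if (functions->ompd_get_generating_task_handle(current, &parent) !=
            ompd_rc_ok)
          break;                 // e.g. ompd_rc_unavailable at the top level
        current = parent;
      }
      return chain;
    }
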
+ task_handle, &task_handle); // Is it generating or scheduling task or something different? } return task_handles; } @@ -1028,6 +1028,52 @@ void OMPDTasks::execute() const for (auto thread: host_thread_handles) { functions->ompd_release_thread_handle(thread); } + + // Cuda tasks + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDTaskHandleCmp> cuda_task_handles(task_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto task_handle: odbGetTaskRegions(functions, t)) { + cuda_task_handles[task_handle].push_back(t); + } + } +printf("cuda tasks: %i\n", cuda_task_handles.size()); + + printf("\nCUDA TASKS\n"); + printf("Task Handle Assoc. Threads ICV Level\n"); + printf("----------------------------------------\n"); + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); + + for (auto th: cuda_task_handles) { + ompd_parallel_handle_t *ph; + ret = functions->ompd_get_task_parallel_handle(th.first, &ph); + if (ret != ompd_rc_ok) { + printf("could not get parallel handle for nesting\n"); + continue; + } + + ompd_word_t icv_level; + cudaIcvs.get(ph, "levels-var", &icv_level); + + printf("%-11p %-14zu %ld\n", th.first, th.second.size(), icv_level); + functions->ompd_release_parallel_handle(ph); + } + + for (auto task: cuda_task_handles) { + functions->ompd_release_task_handle(task.first); + } + + for (auto thread: cuda_thread_handles) { + functions->ompd_release_thread_handle(thread); + } } const char *OMPDTasks::toString() const diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 8bbc6236b..f2aa38fc8 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -557,13 +557,46 @@ ompd_rc_t ompd_get_task_parallel_handle( ompd_address_t taddr; ompd_rc_t ret; - ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .getAddress(&taddr); + if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + TValue parallelHandle; + auto possibleTaskDescr = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference() + .getAddress(&taddr); + if (ret != ompd_rc_ok) { + if (taddr.address == 0) { + parallelHandle = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext") + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } else { + return ret; + } + } else { + parallelHandle = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } + ret = parallelHandle.getAddress(&taddr); + } else { + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; @@ -575,6 +608,7 @@ ompd_rc_t ompd_get_task_parallel_handle( 
(*enclosing_parallel_handle)->ah = task_handle->ah; (*enclosing_parallel_handle)->lwt = task_handle->lwt; (*enclosing_parallel_handle)->th = taddr; + (*enclosing_parallel_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -629,26 +663,33 @@ ompd_rc_t ompd_get_current_task_handle( assert(callbacks && "Callback table not initialized!"); ompd_address_t taddr, lwt; + ompd_rc_t ret = ompd_rc_ok; - TValue taskdata = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ - .cast("kmp_taskdata_t", 1); + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + lwt.address = 0; + taddr = thread_handle->th; + } else { + TValue taskdata = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ + .cast("kmp_taskdata_t", 1); - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = taskdata - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); + ret = taskdata.dereference().getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + ret = taskdata + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + } if (ret != ompd_rc_ok) return ret; @@ -660,6 +701,7 @@ ompd_rc_t ompd_get_current_task_handle( (*task_handle)->th = taddr; (*task_handle)->lwt = lwt; (*task_handle)->ah = thread_handle->ah; + (*task_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -667,6 +709,11 @@ ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { + // Generating and Scheduling task are the same on cuda? 
+ if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + return ompd_get_scheduling_task_handle(task_handle, parent_task_handle); + } + if (!task_handle) return ompd_rc_stale_handle; if (!task_handle->ah) @@ -737,16 +784,31 @@ ompd_rc_t ompd_get_scheduling_task_handle( assert(callbacks && "Callback table not initialized!"); ompd_address_t taddr; + ompd_rc_t ret; - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("scheduling_parent") // td->ompd_task_info.scheduling_parent - .cast("kmp_taskdata_t", 1) - .dereference() - .getAddress(&taddr); + if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + ret = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference() + .getAddress(&taddr); + if (taddr.address == 0) { + return ompd_rc_unavailable; + } + } else { + ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("ompt_task_info") // td->ompt_task_info + .cast("ompt_task_info_t") + .access("scheduling_parent") // td->ompd_task_info.scheduling_parent + .cast("kmp_taskdata_t", 1) + .dereference() + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; @@ -757,6 +819,7 @@ ompd_rc_t ompd_get_scheduling_task_handle( (*parent_task_handle)->th = taddr; (*parent_task_handle)->ah = task_handle->ah; + (*parent_task_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ret; } @@ -778,13 +841,25 @@ ompd_rc_t ompd_get_task_in_parallel( ompd_rc_t ret; ompd_address_t taddr; - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ - .cast("kmp_taskdata_t", 1) - .getArrayElement( - nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ - .getAddress(&taddr); + + if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_paralel_info", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(nth_handle) + .getAddress(&taddr); + } else { + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ + .cast("kmp_taskdata_t", 1) + .getArrayElement( + nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; @@ -795,6 +870,7 @@ ompd_rc_t ompd_get_task_in_parallel( (*task_handle)->th = taddr; (*task_handle)->ah = parallel_handle->ah; + (*task_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } @@ -816,7 +892,10 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, return ompd_rc_stale_handle; if (!task_handle_2) return ompd_rc_stale_handle; - if (task_handle_1->th.address - task_handle_2->th.address) + if (task_handle_1->ah->kind != task_handle_2->ah->kind) + return ompd_rc_bad_input; + if (task_handle_1->th.address - task_handle_2->th.address || + task_handle_1->ah->kind == OMP_DEVICE_KIND_CUDA) *cmp_value = task_handle_1->th.address - task_handle_2->th.address; else *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; @@ -990,7 +1069,43 @@ ompd_rc_t ompd_get_thread_id( ompd_rc_t ret; if (kind != OMPD_THREAD_ID_CUDALOGICAL) { - ret = 
ompd_rc_unsupported; + if (sizeof_thread_id != sizeof(ompd_cudathread_coord_t)) { + return ompd_rc_bad_input; + } + ompd_cudathread_coord_t *cuda_thread_id = + (ompd_cudathread_coord_t*)thread_id; + cuda_thread_id->cudaDevId = thread_handle->cuda_kernel_info->cudaDevId; + cuda_thread_id->cudaContext = thread_handle->cuda_kernel_info->cudaContext; + cuda_thread_id->warpSize = thread_handle->cuda_kernel_info->warpSize; + cuda_thread_id->gridId = thread_handle->cuda_kernel_info->gridId; + + auto threadInfo = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("ompd_thread_info") + .cast("ompd_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + ret = threadInfo.access("threadIdx_x") + .castBase() + .getValue(cuda_thread_id->threadIdx.x); + + if (ret != ompd_rc_ok) + return ret; + + cuda_thread_id->threadIdx.y = cuda_thread_id->threadIdx.z = 0; + + ret = threadInfo.access("blockIdx_x") + .castBase() + .getValue(cuda_thread_id->blockIdx.x); + + if (ret != ompd_rc_ok) + return ret; + + cuda_thread_id->blockIdx.y = cuda_thread_id->blockIdx.z = 0; + + // TODO (mr) add gridDim and blockDim + return ompd_rc_ok; } else { ompd_size_t size; ret = tf.getType(context, "kmp_thread_t").getSize(&size); From 24e9595cc845346b36dc5515d7ee58a968146246 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Mon, 17 Sep 2018 10:17:10 -0700 Subject: [PATCH 49/64] [OMPD] Fix parallel level for explicit tasks --- libompd/gdb-wrapper/OMPDCommand.cpp | 11 ++- libompd/src/omp-debug.cpp | 87 ++++++++++--------- .../deviceRTLs/nvptx/src/ompd-specific.cu | 14 ++- .../deviceRTLs/nvptx/src/ompd-specific.h | 5 ++ .../deviceRTLs/nvptx/src/omptarget-nvptx.h | 3 +- libomptarget/deviceRTLs/nvptx/src/parallel.cu | 1 + libomptarget/deviceRTLs/nvptx/src/task.cu | 4 +- 7 files changed, 77 insertions(+), 48 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 686120dfb..59a68fa58 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -1041,11 +1041,10 @@ void OMPDTasks::execute() const cuda_task_handles[task_handle].push_back(t); } } -printf("cuda tasks: %i\n", cuda_task_handles.size()); printf("\nCUDA TASKS\n"); - printf("Task Handle Assoc. Threads ICV Level\n"); - printf("----------------------------------------\n"); + printf("Task Handle Assoc. 
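The CUDA branch of ompd_get_thread_id above fills an ompd_cudathread_coord_t from the cached kernel info plus the per-thread threadIdx_x and blockIdx_x values, zeroing the y and z components; gridDim and blockDim are still marked TODO here and get filled in by a later patch. A small stand-in for that coordinate record and how a tool might print it; the field names follow the patch, the layout is illustrative only:

#include <cstdint>
#include <cstdio>

struct dim3_t { uint64_t x, y, z; };
struct cudathread_coord_t {
  uint64_t cudaDevId, cudaContext, warpSize, gridId;
  dim3_t gridDim, blockDim, blockIdx, threadIdx;
};

void print_coord(const cudathread_coord_t &c) {
  printf("dev %llu grid %llu block (%llu,%llu,%llu) thread (%llu,%llu,%llu)\n",
         (unsigned long long)c.cudaDevId, (unsigned long long)c.gridId,
         (unsigned long long)c.blockIdx.x, (unsigned long long)c.blockIdx.y,
         (unsigned long long)c.blockIdx.z, (unsigned long long)c.threadIdx.x,
         (unsigned long long)c.threadIdx.y, (unsigned long long)c.threadIdx.z);
}

int main() {
  cudathread_coord_t c{};        // zero-initialized, as the y/z members must be
  c.cudaDevId = 0; c.gridId = 42; c.warpSize = 32;
  c.blockIdx.x = 3; c.threadIdx.x = 17;
  print_coord(c);
}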
Threads ICV Level task function\n"); + printf("--------------------------------------------------------\n"); // For instantiation, it doesnt matter which device handle we use for // OMPDIcvs, just use the first one @@ -1063,7 +1062,11 @@ printf("cuda tasks: %i\n", cuda_task_handles.size()); ompd_word_t icv_level; cudaIcvs.get(ph, "levels-var", &icv_level); - printf("%-11p %-14zu %ld\n", th.first, th.second.size(), icv_level); + ompd_address_t task_func_addr; + task_func_addr.address = 0; + functions->ompd_get_task_function(th.first, &task_func_addr); + + printf("%-11p %-14zu %-8ld %p\n", th.first, th.second.size(), icv_level, (void*)task_func_addr.address); functions->ompd_release_parallel_handle(ph); } diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index f2aa38fc8..d4df64df7 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -57,11 +57,6 @@ ompd_process_initialize(ompd_address_space_context_t ompd_rc_t ret = initTypeSizes(context); if (ret != ompd_rc_ok) return ret; - *addrhandle = new ompd_address_space_handle_t; - if (!addrhandle) - return ompd_rc_error; - (*addrhandle)->context = context; - (*addrhandle)->kind = OMP_DEVICE_KIND_HOST; ret = TValue(context, "ompd_state") .castBase(ompd_type_long_long) @@ -72,7 +67,6 @@ ompd_process_initialize(ompd_address_space_context_t (void **)(addrhandle)); if (ret != ompd_rc_ok) return ret; -// *addrhandle = new ompd_address_space_handle_t; if (!addrhandle) return ompd_rc_error; (*addrhandle)->context = context; @@ -560,14 +554,27 @@ ompd_rc_t ompd_get_task_parallel_handle( if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { TValue parallelHandle; + // The ompd_parallel_info_t we need is only present in the previous task + // of an implicit task. + uint16_t task_is_implicit = 0; + ret = ompd_rc_ok; auto possibleTaskDescr = TValue(context, task_handle->th) - .cast("omptarget_nvptx_TaskDescr", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .access("prev") - .cast("omptarget_nvptx_TaskDescr", 1, - OMPD_SEGMENT_CUDA_PTX_GLOBAL); - ret = possibleTaskDescr.dereference() - .getAddress(&taddr); + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + while (!task_is_implicit && ret == ompd_rc_ok) { + ret = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_implicit") + .castBase() + .getValue(task_is_implicit); + possibleTaskDescr = possibleTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", + 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference().getAddress(&taddr); + } + if (ret != ompd_rc_ok) { if (taddr.address == 0) { parallelHandle = TValue(context, NULL, @@ -1083,7 +1090,7 @@ ompd_rc_t ompd_get_thread_id( .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) .access("ompd_thread_info") - .cast("ompd_thread_info_t", 0, + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL); ret = threadInfo.access("threadIdx_x") @@ -1234,7 +1241,6 @@ ompd_rc_t ompd_get_task_frame( return ret; } -#if 1 // the runtime currently does not have task function information ompd_rc_t ompd_get_task_function( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_address_t *task_addr /* OUT: first instruction in the task region */ @@ -1251,33 +1257,36 @@ ompd_rc_t ompd_get_task_function( return ompd_rc_needs_state_tracking; assert(callbacks && "Callback table not initialized!"); - -#if 0 - /* We don't have a task function for implicit tasks */ - ompd_word_t implicit; - ompd_rc_t 
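The key change of this patch, visible in the ompd_get_task_parallel_handle hunk above, is the walk over the prev chain: as the patch comment puts it, the parallel info the tool needs "is only present in the previous task of an implicit task", so the code follows prev until the descriptor it just examined is implicit and then uses the one before it; if the chain ends first, the level-zero parallel info in the thread-private context applies. A minimal sketch of that walk with a mock descriptor list:

#include <cstdio>

// Stand-in for the ompd bookkeeping in omptarget_nvptx_TaskDescr.
struct TaskDescr {
  bool task_implicit;
  TaskDescr *prev;
};

// Follow prev until the descriptor just examined is implicit; return the one
// before it. nullptr means: fall back to ompd_levelZeroParallelInfo.
const TaskDescr *find_parallel_carrier(const TaskDescr *cur) {
  bool implicit = false;
  while (cur && !implicit) {
    implicit = cur->task_implicit;
    cur = cur->prev;
  }
  return cur;
}

int main() {
  TaskDescr outer{true, nullptr};   // implicit task, start of the chain
  TaskDescr inner{true, &outer};    // implicit task of a nested region
  TaskDescr expl{false, &inner};    // explicit task currently running
  const TaskDescr *carrier = find_parallel_carrier(&expl);
  printf("carrier: %s\n", carrier ? "found" : "none (use level-zero info)");
}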
ret = ompd_is_implicit (task_handle, &implicit); - if (ret != ompd_rc_ok) - return ret; - if (implicit) - return ompd_rc_bad_input; -#else ompd_rc_t ret; -#endif - task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - TValue taskInfo; - if(task_handle->lwt.address!=0) - return ompd_rc_bad_input; // We need to decide what we do here. - else - ret = TValue(context, task_handle->th). - cast("kmp_taskdata_t",0). /*t*/ - getArrayElement(1). /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ - cast("kmp_task_t",0). /* (kmp_task_t *) */ - access("routine"). /*td->ompt_task_info*/ - castBase(). - getValue(task_addr->address); + + if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + ret = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_function") + .castBase() + .getValue(task_addr->address); + + } else { + task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + TValue taskInfo; + if(task_handle->lwt.address!=0) + return ompd_rc_bad_input; // We need to decide what we do here. + else + ret = TValue(context, task_handle->th). + cast("kmp_taskdata_t",0). /*t*/ + getArrayElement(1). /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ + cast("kmp_task_t",0). /* (kmp_task_t *) */ + access("routine"). /*td->ompt_task_info*/ + castBase(). + getValue(task_addr->address); + } return ret; } -#endif /* --- --- OMPD Version and Compatibility Information ----------------------- */ diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index 61bb8413c..89b921494 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -64,9 +64,12 @@ __device__ void ompd_init ( void ) ompd_target_initialized = 1; } -INLINE void ompd_init_thread(omptarget_nvptx_TaskDescr *currTaskDescr) { +INLINE void ompd_init_thread(omptarget_nvptx_TaskDescr *currTaskDescr, + void *task_func, uint8_t implicit) { currTaskDescr->ompd_thread_info.blockIdx_x = blockIdx.x; currTaskDescr->ompd_thread_info.threadIdx_x = threadIdx.x; + currTaskDescr->ompd_thread_info.task_function = task_func; + currTaskDescr->ompd_thread_info.task_implicit = implicit; } __device__ void ompd_set_device_specific_thread_state( @@ -80,16 +83,21 @@ __device__ void ompd_set_device_thread_state(omp_state_t state) { __device__ void ompd_init_thread_parallel() { omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); - ompd_init_thread(currTaskDescr); + ompd_init_thread(currTaskDescr, omptarget_nvptx_workFn, 1); ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_parallel); } __device__ void ompd_init_thread_master() { omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); - ompd_init_thread(currTaskDescr); + ompd_init_thread(currTaskDescr, NULL, 1); ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_serial); } +__device__ void ompd_init_explicit_task(void *task_func) { + omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(taskDescr, task_func, 0); +} + __device__ void ompd_bp_parallel_begin (){ asm (""); } __device__ void ompd_bp_parallel_end (){ asm (""); } __device__ void ompd_bp_task_begin (){ asm (""); } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h 
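The host branch of ompd_get_task_function above relies on the kmp.h convention cited in its comment: the kmp_task_t record sits directly behind its kmp_taskdata_t header, so indexing one element past the header yields the task. A self-contained illustration of that layout trick with simplified stand-in structs, not the real kmp.h definitions:

#include <cstdio>
#include <cstdlib>
#include <new>

struct taskdata_t { int flags; int refcount; };             // stand-in header
struct task_t { void (*routine)(void *); void *shareds; };  // stand-in task

void hello(void *) { puts("task routine called"); }

int main() {
  // One contiguous allocation: header first, task record right behind it.
  void *block = std::malloc(sizeof(taskdata_t) + sizeof(task_t));
  taskdata_t *td = new (block) taskdata_t{0, 1};
  new (td + 1) task_t{hello, nullptr};

  // The debugger-side chain does the same arithmetic symbolically:
  //   cast("kmp_taskdata_t").getArrayElement(1).cast("kmp_task_t").access("routine")
  task_t *task = reinterpret_cast<task_t *>(td + 1);
  task->routine(task->shareds);

  std::free(block);
}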
b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 64a2bf5f3..2a2feb727 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -28,6 +28,8 @@ extern "C" __device__ void ompd_bp_task_end ( void ); OMPD_ACCESS(ompd_nvptx_thread_info_t,state) \ OMPD_ACCESS(ompd_nvptx_thread_info_t,threadIdx_x) \ OMPD_ACCESS(ompd_nvptx_thread_info_t,enclosed_parallel) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,task_function) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,task_implicit) \ OMPD_ACCESS(ompd_nvptx_parallel_info_t,level) \ OMPD_ACCESS(ompd_nvptx_parallel_info_t,parallel_tasks) @@ -53,6 +55,7 @@ __device__ void ompd_set_device_specific_thread_state( omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); __device__ void ompd_set_device_thread_state(omp_state_t state); __device__ void ompd_init_thread_parallel(); +__device__ void ompd_init_explicit_task(void *task_func); INLINE void ompd_reset_device_thread_state() { ompd_set_device_thread_state(omp_state_work_serial); @@ -83,6 +86,8 @@ typedef struct { // simply store ThreadIdx.x and BlockIdx.x uint16_t threadIdx_x; ompd_nvptx_parallel_info_t enclosed_parallel; + void *task_function; + uint16_t task_implicit; } ompd_nvptx_thread_info_t; #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 0cd65a502..88daa79d4 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -154,7 +154,8 @@ class omptarget_nvptx_TaskDescr { #if OMPD_SUPPORT friend void __device__ ompd_init( void ); friend INLINE void ompd_init_thread( - omptarget_nvptx_TaskDescr *currTaskDescr); + omptarget_nvptx_TaskDescr *currTaskDescr, void *task_func, + uint8_t implicit); friend __device__ void ompd_set_device_specific_thread_state( omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index f4e115614..655a13488 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -221,6 +221,7 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); omptarget_nvptx_workFn = WorkFn; +printf("__kmpc_kernel_prepare_parallel workFn=%p\n", WorkFn); if (!IsOMPRuntimeInitialized) return; diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/nvptx/src/task.cu index 8d4796778..924e5262e 100644 --- a/libomptarget/deviceRTLs/nvptx/src/task.cu +++ b/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -97,7 +97,9 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - +#ifdef OMPD_SUPPORT + ompd_init_explicit_task((void*)(newKmpTaskDescr->sub)); +#endif // 3. 
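The OMPD_FOREACH_ACCESS list extended above is an X-macro: the device runtime writes the (type, field) pairs once and each consumer supplies its own OMPD_ACCESS to generate what it needs for those fields. What the real OMPD_ACCESS expands to is not shown in this hunk; generating named offset constants, as in the self-contained sketch below, is one common use of the pattern and is meant purely as an illustration (the struct definitions are simplified stand-ins):

#include <cstddef>
#include <cstdint>
#include <cstdio>

typedef struct { uint64_t level; void *parallel_tasks; } parallel_info_t;
typedef struct {
  uint64_t state;
  uint16_t threadIdx_x;
  void *task_function;
  uint16_t task_implicit;
} thread_info_t;

// The list is written once...
#define FOREACH_ACCESS(ACCESS)            \
  ACCESS(thread_info_t, task_function)    \
  ACCESS(thread_info_t, task_implicit)    \
  ACCESS(parallel_info_t, parallel_tasks)

// ...and expanded per consumer; here into offsets a debugger could read.
#define DEFINE_OFFSET(type, field) \
  const uint64_t access_##type##_##field = offsetof(type, field);
FOREACH_ACCESS(DEFINE_OFFSET)
#undef DEFINE_OFFSET

int main() {
  printf("offsetof(thread_info_t, task_function) = %llu\n",
         (unsigned long long)access_thread_info_t_task_function);
}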
call sub PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); From d15db7631b4d9310b15d1f6949e50802db9ea75b Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 18 Sep 2018 16:20:09 -0700 Subject: [PATCH 50/64] [OMPD] omp_device_t -> ompd_device_t --- libompd/gdb-wrapper/OMPDCommand.cpp | 2 +- libompd/gdb-wrapper/ompd_typedefs.h | 4 +-- libompd/src/omp-debug.cpp | 30 +++++++++++----------- libompd/src/omp-debug.h | 2 +- libompd/src/omp-icv.cpp | 8 +++--- libompd/src/omp-state.cpp | 2 +- libompd/src/ompd-private.h | 2 +- libompd/src/{ompd_types.h => ompd-types.h} | 8 +++--- libompd/src/ompd.h | 2 +- 9 files changed, 30 insertions(+), 30 deletions(-) rename libompd/src/{ompd_types.h => ompd-types.h} (91%) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index 59a68fa58..dc4ab9af6 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -549,7 +549,7 @@ map odbInitCudaDevices(OMPDFunctionsPtr functions functions->ompd_device_initialize( addrhandle, ret.at(i.coord.cudaContext).getGlobalOmpdContext(), - OMP_DEVICE_KIND_CUDA, + OMPD_DEVICE_KIND_CUDA, sizeof(i.coord.cudaContext), &i.coord.cudaContext, &ret.at(i.coord.cudaContext).ompd_device_handle); diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h index 837b943f8..cdaca39cb 100644 --- a/libompd/gdb-wrapper/ompd_typedefs.h +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -8,7 +8,7 @@ // TODO: (mr) I dont have time to change every thread id kind, so this is some compat stuff #define ompd_thread_id_pthread OMPD_THREAD_ID_PTHREAD #define ompd_thread_id_cudalogical OMPD_THREAD_ID_CUDALOGICAL -#define ompd_device_kind_cuda OMP_DEVICE_KIND_CUDA +#define ompd_device_kind_cuda OMPD_DEVICE_KIND_CUDA typedef ompd_rc_t (*ompd_initialize_fn_t) ( ompd_word_t api_version, @@ -37,7 +37,7 @@ typedef ompd_rc_t (*ompd_process_initialize_fn_t) ( typedef ompd_rc_t (*ompd_device_initialize_fn_t) ( ompd_address_space_handle_t *process_handle, /*IN: address space of the OpenMP process*/ ompd_address_space_context_t *device_context, /*IN: Opaque tool handle for device address space*/ - omp_device_t kind, /*IN: device identifier kind*/ + ompd_device_t kind, /*IN: device identifier kind*/ ompd_size_t sizeof_id, /*IN: size of device identifier*/ void *id, /*IN: device identifier*/ ompd_address_space_handle_t **device_handle /*OUT: device handle*/ diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index d4df64df7..ad71571c4 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -70,7 +70,7 @@ ompd_process_initialize(ompd_address_space_context_t if (!addrhandle) return ompd_rc_error; (*addrhandle)->context = context; - (*addrhandle)->kind = OMP_DEVICE_KIND_HOST; + (*addrhandle)->kind = OMPD_DEVICE_KIND_HOST; return ompd_rc_ok; } @@ -163,7 +163,7 @@ ompd_rc_t ompd_device_initialize( if (!device_handle) return ompd_rc_error; (*device_handle)->context = device_context; - (*device_handle)->kind = OMP_DEVICE_KIND_CUDA; + (*device_handle)->kind = OMPD_DEVICE_KIND_CUDA; (*device_handle)->id = (uint64_t)id; return ompd_rc_ok; } @@ -195,7 +195,7 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_address_t taddr; - if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { uint16_t thread_idx; // We cannot use the task descriptor associated with the parallel info as // their task might not be currently active @@ -280,7 +280,7 @@ 
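The function-pointer typedefs in ompd_typedefs.h above exist because a debugger binds the OMPD entry points at runtime rather than linking against them. The sketch below shows one way such binding could look using dlopen/dlsym; the library name and the loading scheme are assumptions made only for illustration, and a real tool would cast the resolved symbols to the exact prototypes from ompd.h (build with -ldl):

#include <cstdio>
#include <dlfcn.h>

int main() {
  void *dll = dlopen("libompd.so", RTLD_LAZY);   // assumed library name
  if (!dll) { fprintf(stderr, "dlopen: %s\n", dlerror()); return 1; }

  // Resolve one entry point by name; a real tool would cast the result to the
  // matching *_fn_t typedef (e.g. ompd_initialize_fn_t) and fill a whole table
  // before calling anything.
  void *sym = dlsym(dll, "ompd_initialize");
  if (!sym) { fprintf(stderr, "dlsym: %s\n", dlerror()); return 1; }

  printf("resolved ompd_initialize at %p\n", sym);
  dlclose(dll);
  return 0;
}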
ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, if (thread_handle_1->ah->kind != thread_handle_2->ah->kind) return ompd_rc_bad_input; *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; - if (*cmp_value == 0 && thread_handle_1->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (*cmp_value == 0 && thread_handle_1->ah->kind == OMPD_DEVICE_KIND_CUDA) { *cmp_value = thread_handle_1->cuda_kernel_info->cudaDevId - thread_handle_2->cuda_kernel_info->cudaDevId; if (*cmp_value == 0) { @@ -321,7 +321,7 @@ ompd_rc_t ompd_get_current_parallel_handle( ompd_rc_t ret; - if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { ompd_address_t taddr; TValue prevTask = TValue(context, thread_handle->th) .cast("omptarget_nvptx_TaskDescr", 0) @@ -416,7 +416,7 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( ompd_address_t taddr = parallel_handle->th, lwt; ompd_rc_t ret; - if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { uint16_t level; TValue curParallelInfo = TValue(context, taddr) .cast("ompd_nvptx_parallel_info_t", 0, @@ -552,7 +552,7 @@ ompd_rc_t ompd_get_task_parallel_handle( ompd_rc_t ret; - if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { TValue parallelHandle; // The ompd_parallel_info_t we need is only present in the previous task // of an implicit task. @@ -640,7 +640,7 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, return ompd_rc_stale_handle; if (parallel_handle_1->ah->kind != parallel_handle_2->ah->kind) return ompd_rc_bad_input; - if (parallel_handle_1->ah->kind == OMP_DEVICE_KIND_HOST) { + if (parallel_handle_1->ah->kind == OMPD_DEVICE_KIND_HOST) { if (parallel_handle_1->th.address - parallel_handle_2->th.address) *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; else @@ -674,7 +674,7 @@ ompd_rc_t ompd_get_current_task_handle( lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { lwt.address = 0; taddr = thread_handle->th; } else { @@ -717,7 +717,7 @@ ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { // Generating and Scheduling task are the same on cuda? 
- if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { return ompd_get_scheduling_task_handle(task_handle, parent_task_handle); } @@ -793,7 +793,7 @@ ompd_rc_t ompd_get_scheduling_task_handle( ompd_address_t taddr; ompd_rc_t ret; - if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { ret = TValue(context, task_handle->th) .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) @@ -849,7 +849,7 @@ ompd_rc_t ompd_get_task_in_parallel( ompd_rc_t ret; ompd_address_t taddr; - if (parallel_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { ret = TValue(context, parallel_handle->th) .cast("ompd_nvptx_paralel_info", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) @@ -902,7 +902,7 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, if (task_handle_1->ah->kind != task_handle_2->ah->kind) return ompd_rc_bad_input; if (task_handle_1->th.address - task_handle_2->th.address || - task_handle_1->ah->kind == OMP_DEVICE_KIND_CUDA) + task_handle_1->ah->kind == OMPD_DEVICE_KIND_CUDA) *cmp_value = task_handle_1->th.address - task_handle_2->th.address; else *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; @@ -1156,7 +1156,7 @@ ompd_rc_t ompd_get_state( ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); - if (thread_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { if (wait_id) *wait_id = 0; //TODO: (mr) implement wait_ids in nvptx device rtl ret = TValue(context, thread_handle->th) @@ -1259,7 +1259,7 @@ ompd_rc_t ompd_get_task_function( assert(callbacks && "Callback table not initialized!"); ompd_rc_t ret; - if (task_handle->ah->kind == OMP_DEVICE_KIND_CUDA) { + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; ret = TValue(context, task_handle->th) .cast("omptarget_nvptx_TaskDescr", 0, diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index 706f91644..61b809c25 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -62,7 +62,7 @@ typedef struct _ompd_process_handle_s { typedef struct _ompd_address_space_handle_s { ompd_address_space_context_t *context; - omp_device_t kind; + ompd_device_t kind; uint64_t id; } ompd_address_space_handle_t; diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp index 54207001c..72598ad09 100644 --- a/libompd/src/omp-icv.cpp +++ b/libompd/src/omp-icv.cpp @@ -87,7 +87,7 @@ ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, if (!handle) { return ompd_rc_stale_handle; } - if (handle->kind == OMP_DEVICE_KIND_CUDA) { + if (handle->kind == OMPD_DEVICE_KIND_CUDA) { return ompd_enumerate_icvs_cuda(current, next_id, next_icv_name, next_scope, more); } @@ -446,7 +446,7 @@ ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, return ompd_rc_bad_input; } - omp_device_t device_kind; + ompd_device_t device_kind; switch (scope) { case ompd_scope_thread: @@ -466,7 +466,7 @@ ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, } - if (device_kind == OMP_DEVICE_KIND_HOST) { + if (device_kind == OMPD_DEVICE_KIND_HOST) { switch (icv_id) { case ompd_icv_levels_var: return ompd_get_level((ompd_parallel_handle_t *)handle, icv_value); @@ -491,7 +491,7 @@ ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, default: return ompd_rc_unsupported; } - } else if (device_kind == 
OMP_DEVICE_KIND_CUDA) { + } else if (device_kind == OMPD_DEVICE_KIND_CUDA) { switch (icv_id) { case ompd_icv_levels_var: return ompd_get_level_cuda((ompd_parallel_handle_t *)handle, icv_value); diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp index df117a05b..0e64aad95 100644 --- a/libompd/src/omp-state.cpp +++ b/libompd/src/omp-state.cpp @@ -35,7 +35,7 @@ ompd_rc_t ompd_enumerate_states( ompd_word_t current_state, ompd_word_t *next_state, const char **next_state_name, ompd_word_t *more_enums) { ompd_rc_t ret; - if (address_space_handle->kind == OMP_DEVICE_KIND_CUDA) { + if (address_space_handle->kind == OMPD_DEVICE_KIND_CUDA) { // We only support a small number of states for cuda devices *more_enums = 1; switch (current_state) { diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h index 7210980aa..bc5a04794 100644 --- a/libompd/src/ompd-private.h +++ b/libompd/src/ompd-private.h @@ -69,6 +69,6 @@ typedef enum ompd_target_prim_types_t { ompd_type_max } ompd_target_prim_types_t; -#include "ompd_types.h" +#include "ompd-types.h" #endif /*SRC_OMPD_PRIVATE_H*/ diff --git a/libompd/src/ompd_types.h b/libompd/src/ompd-types.h similarity index 91% rename from libompd/src/ompd_types.h rename to libompd/src/ompd-types.h index f6c62d566..ea5aedef4 100644 --- a/libompd/src/ompd_types.h +++ b/libompd/src/ompd-types.h @@ -56,10 +56,10 @@ typedef struct ompd_cudathread_coord_t { #define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) /* Kinds of device device address spaces */ -#define OMP_DEVICE_KIND_HOST ((omp_device_t)1) -#define OMP_DEVICE_KIND_CUDA ((omp_device_t)2) +#define OMPD_DEVICE_KIND_HOST ((ompd_device_t)1) +#define OMPD_DEVICE_KIND_CUDA ((ompd_device_t)2) /* The range of non-standard implementation defined values */ -#define OMP_DEVICE_IMPL_LO ((omp_device_t)1000000) -#define OMP_DEVICE_IMPL_HI ((omp_device_t)1100000) +#define OMPD_DEVICE_IMPL_LO ((ompd_device_t)1000000) +#define OMPD_DEVICE_IMPL_HI ((ompd_device_t)1100000) #endif diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 1d78151e8..48ae79e98 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -59,7 +59,7 @@ typedef struct ompd_address_t { const uint64_t ompd_segment_none = 0; /* types for device and thread id KIND, not for the actual thread/device id */ -typedef uint64_t omp_device_t; +typedef uint64_t ompd_device_t; typedef uint64_t ompd_thread_id_t; /** From 58e5df5707819559639a55b8fd4e14018d0aeed5 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 21 Sep 2018 08:34:08 -0700 Subject: [PATCH 51/64] [OMPD] Add examples to test ompd on cuda --- libompd/cuda_examples/test_target_generic.c | 33 ++++++++++++++ .../cuda_examples/test_target_multilevel.c | 43 +++++++++++++++++++ .../cuda_examples/test_target_noparallel.c | 24 +++++++++++ libompd/cuda_examples/test_target_single.c | 29 +++++++++++++ libompd/cuda_examples/test_target_spmd.c | 31 +++++++++++++ libompd/cuda_examples/test_target_task.c | 33 ++++++++++++++ 6 files changed, 193 insertions(+) create mode 100644 libompd/cuda_examples/test_target_generic.c create mode 100644 libompd/cuda_examples/test_target_multilevel.c create mode 100644 libompd/cuda_examples/test_target_noparallel.c create mode 100644 libompd/cuda_examples/test_target_single.c create mode 100644 libompd/cuda_examples/test_target_spmd.c create mode 100644 libompd/cuda_examples/test_target_task.c diff --git a/libompd/cuda_examples/test_target_generic.c b/libompd/cuda_examples/test_target_generic.c new file mode 100644 index 000000000..db881da5e --- /dev/null 
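The ompd_get_icv_from_scope hunks above first map the scope argument onto the handle type that was passed in, then branch on the device kind and the ICV id. A reduced sketch of that two-level dispatch; the enum values and which ICVs the CUDA side supports are illustrative stand-ins, not the real coverage:

#include <cstdio>

enum scope_t { SCOPE_THREAD, SCOPE_PARALLEL, SCOPE_TASK };
enum kind_t { KIND_HOST, KIND_CUDA };
enum icv_t { ICV_LEVELS_VAR, ICV_NTHREADS_VAR };
enum rc_t { RC_OK, RC_UNSUPPORTED, RC_BAD_INPUT };

// Stand-in for a parallel handle; libompd instead casts the opaque pointer to
// the per-scope handle type and reads the device kind from its address space.
struct parallel_handle_t { kind_t kind; long levels; long nthreads; };

rc_t get_icv(void *opaque, scope_t scope, icv_t icv, long *value) {
  if (scope != SCOPE_PARALLEL)
    return RC_BAD_INPUT;                 // only the parallel scope is modelled
  const parallel_handle_t *h = static_cast<const parallel_handle_t *>(opaque);
  switch (icv) {
  case ICV_LEVELS_VAR:
    *value = h->levels;
    return RC_OK;
  case ICV_NTHREADS_VAR:
    if (h->kind == KIND_CUDA)
      return RC_UNSUPPORTED;             // illustrative choice only
    *value = h->nthreads;
    return RC_OK;
  }
  return RC_UNSUPPORTED;
}

int main() {
  parallel_handle_t ph{KIND_CUDA, 2, 32};
  long v = 0;
  if (get_icv(&ph, SCOPE_PARALLEL, ICV_LEVELS_VAR, &v) == RC_OK)
    printf("levels-var = %ld\n", v);
}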
+++ b/libompd/cuda_examples/test_target_generic.c @@ -0,0 +1,33 @@ +// Testing generic mode of nvptx devRtl +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + test_breakpoint(); + #pragma omp parallel for + for (i=0; i +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + omp_set_nested(1); + #pragma omp target map(v1, v2, p) + { + omp_set_nested(1); + #pragma omp parallel shared(v1, v2, p, N) num_threads(4) + { + printf("Outer region - thread ID: %d\n", omp_get_thread_num()); + #pragma omp for + for (int i = 0; i < N; ++i) + { + float acc = 0; + #pragma omp parallel shared(v1, v2, p, N) num_threads(4) + #pragma omp for + for(int j = 0; j < N; ++j) + { + test_breakpoint(); + p[i] += v1[i] + v2[i]; + } + } + } + printf("End of target region\n"); + } +//output(p, N); +} +int main() { + printf("calling vec_mul...\n"); + vec_mult(64); + printf("done\n"); + return 0; +} diff --git a/libompd/cuda_examples/test_target_noparallel.c b/libompd/cuda_examples/test_target_noparallel.c new file mode 100644 index 000000000..2e2f2f51c --- /dev/null +++ b/libompd/cuda_examples/test_target_noparallel.c @@ -0,0 +1,24 @@ +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + #pragma omp target map(v1, v2, p) + { + test_breakpoint(); + p[0] = v[0] * v[0]; + } +} +int main() { + printf("calling vec_mul...\n"); + vec_mult(64); + printf("done\n"); + return 0; +} diff --git a/libompd/cuda_examples/test_target_single.c b/libompd/cuda_examples/test_target_single.c new file mode 100644 index 000000000..4a2bc3260 --- /dev/null +++ b/libompd/cuda_examples/test_target_single.c @@ -0,0 +1,29 @@ +#include + +#pragma omp declare target +float mult(float u, float v) { + return u * v; +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + #pragma omp parallel for + for (i=0; i + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + #pragma omp parallel for + for (i=0; i +#include + +#pragma omp declare target +void task1() { + printf("Hello from Task 1\n"); + uint32_t enter_frame = 0; + for(;1;) { + } +} +void task2() { + printf("Hello from Task 2\n"); + for(;1;) { + } +} +#pragma omp end declare target + +int main() { + #pragma omp target + { + #pragma omp parallel num_threads(4) + { + #pragma omp single + { + #pragma omp task + task1(); + #pragma omp task + task2(); + } + } + } + return 0; +} From 42982e48fe4737edf617a4831920dd8b70335bfe Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 21 Sep 2018 08:41:28 -0700 Subject: [PATCH 52/64] [OMPD] Add ompd breakpoints --- libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu | 2 ++ libomptarget/deviceRTLs/nvptx/src/ompd-specific.h | 4 +++- .../deviceRTLs/nvptx/src/omptarget-nvptx.cu | 13 +++++++++++++ libomptarget/deviceRTLs/nvptx/src/parallel.cu | 12 +++++++++++- libomptarget/deviceRTLs/nvptx/src/task.cu | 5 +++++ 5 files changed, 34 insertions(+), 2 deletions(-) diff 
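The example sources above lost the arguments of their #include directives in transit (printf needs <stdio.h>, and the multilevel and task examples additionally need <omp.h> and <stdint.h>), and test_target_noparallel.c multiplies v[0] although only v1 and v2 are declared. Below is a compilable variant of the no-parallel example under those assumptions, with v1[0] * v2[0] taken as the intended product and N turned into a compile-time constant so the sketch is also valid C++:

#include <stdio.h>

constexpr int N = 64;

#pragma omp declare target
void test_breakpoint() {
  asm("");
}
#pragma omp end declare target

void vec_mult() {
  float p[N], v1[N], v2[N];
  v1[0] = 2.0f; v2[0] = 3.0f;   // stand-in for the commented-out init()
  #pragma omp target map(v1, v2, p)
  {
    test_breakpoint();
    p[0] = v1[0] * v2[0];
  }
  printf("p[0] = %f\n", p[0]);
}

int main() {
  printf("calling vec_mult...\n");
  vec_mult();
  printf("done\n");
  return 0;
}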
--git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu index 89b921494..3cc18b908 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -102,4 +102,6 @@ __device__ void ompd_bp_parallel_begin (){ asm (""); } __device__ void ompd_bp_parallel_end (){ asm (""); } __device__ void ompd_bp_task_begin (){ asm (""); } __device__ void ompd_bp_task_end (){ asm (""); } +__device__ void ompd_bp_thread_begin (){ asm (""); } +__device__ void ompd_bp_thread_end (){ asm (""); } #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 2a2feb727..91e43b747 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -16,6 +16,8 @@ extern "C" __device__ void ompd_bp_parallel_begin ( void ); extern "C" __device__ void ompd_bp_parallel_end ( void ); extern "C" __device__ void ompd_bp_task_begin ( void ); extern "C" __device__ void ompd_bp_task_end ( void ); +extern "C" __device__ void ompd_bp_thread_begin ( void ); +extern "C" __device__ void ompd_bp_thread_end ( void ); #define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ @@ -87,7 +89,7 @@ typedef struct { uint16_t threadIdx_x; ompd_nvptx_parallel_info_t enclosed_parallel; void *task_function; - uint16_t task_implicit; + uint8_t task_implicit; } ompd_nvptx_thread_info_t; #endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 8a1f3e05d..f3202a2bb 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -95,6 +95,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { #ifdef OMPD_SUPPORT ompd_init(); ompd_init_thread_master(); + ompd_bp_thread_begin(); #endif /*OMPD_SUPPORT*/ } @@ -109,6 +110,9 @@ EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); } +#ifdef OMPD_SUPPORT + ompd_bp_thread_end(); +#endif // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; } @@ -144,6 +148,8 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, workDescr.CounterGroup().Reset(); #ifdef OMPD_SUPPORT ompd_init(); + ompd_bp_parallel_begin(); // This should be placed later, but the parallel + // handle is ready from here on. #endif /*OMPD_SUPPORT*/ } __syncthreads(); @@ -183,6 +189,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, #ifdef OMPD_SUPPORT ompd_init_thread_parallel(); // __kmpc_kernel_parallel() is not called in // spmd mode + ompd_bp_thread_begin(); #endif } @@ -190,8 +197,14 @@ EXTERN void __kmpc_spmd_kernel_deinit() { // We're not going to pop the task descr stack of each thread since // there are no more parallel regions in SPMD mode. __syncthreads(); +#ifdef OMPD_SUPPORT + ompd_bp_thread_end(); +#endif int threadId = GetThreadIdInBlock(); if (threadId == 0) { +#ifdef OMPD_SUPPORT + ompd_bp_parallel_end(); +#endif // Enqueue omp state object for use by another team. 
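This patch brackets thread, parallel, and task lifetimes in the device runtime with empty ompd_bp_* functions whose bodies are just asm(""). They exist so a debugger has stable symbols to break on at the moments when the corresponding OMPD handles become valid or invalid; the empty asm statement keeps the compiler from eliminating or merging them. A host-side model of the idea (the real functions are __device__ code in the nvptx runtime):

#include <cstdio>

extern "C" void bp_parallel_begin() { asm(""); } // break here: region is up
extern "C" void bp_parallel_end()   { asm(""); } // break here: about to tear down

void run_parallel_region() {
  bp_parallel_begin();
  printf("...parallel work...\n");
  bp_parallel_end();
}

int main() { run_parallel_region(); }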
int slot = smid() % MAX_SM; omptarget_nvptx_device_State[slot].Enqueue( diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 655a13488..0446d7170 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -221,7 +221,6 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); omptarget_nvptx_workFn = WorkFn; -printf("__kmpc_kernel_prepare_parallel workFn=%p\n", WorkFn); if (!IsOMPRuntimeInitialized) return; @@ -316,6 +315,7 @@ printf("__kmpc_kernel_prepare_parallel workFn=%p\n", WorkFn); // Move the previous thread into undefined state (will be reset in __kmpc_kernel_end_parallel) // TODO (mr) find a better place to do this ompd_set_device_thread_state(omp_state_undefined); + ompd_bp_parallel_begin(); #endif /*OMPD_SUPPORT*/ // set number of threads on work descriptor @@ -374,6 +374,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, isActive = true; #ifdef OMPD_SUPPORT ompd_init_thread_parallel(); + ompd_bp_thread_begin(); #endif /*OMPD_SUPPORT*/ } @@ -391,6 +392,10 @@ EXTERN void __kmpc_kernel_end_parallel() { threadId, currTaskDescr->GetPrevTaskDescr()); #ifdef OMPD_SUPPORT ompd_reset_device_thread_state(); + ompd_bp_thread_end(); + if (threadId == 0) { + ompd_bp_parallel_end(); + } #endif /*OMPD_SUPPORT*/ } @@ -440,6 +445,8 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { newTaskDescr); #ifdef OMPD_SUPPORT ompd_init_thread_parallel(); // we are still in a prallel region + // every thread is a parallel region.. hooray + ompd_bp_parallel_begin(); #endif /*OMPD_SUPPORT*/ } @@ -453,6 +460,9 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, // set new top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); +#ifdef OMPD_SUPPORT + ompd_bp_parallel_end(); +#endif // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); } diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/nvptx/src/task.cu index 924e5262e..76166ea8c 100644 --- a/libomptarget/deviceRTLs/nvptx/src/task.cu +++ b/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -99,6 +99,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); #ifdef OMPD_SUPPORT ompd_init_explicit_task((void*)(newKmpTaskDescr->sub)); + ompd_bp_task_begin(); #endif // 3. call sub PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", @@ -107,6 +108,10 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, PRINT(LD_TASK, "return from call task sub 0x%llx()\n", P64(newKmpTaskDescr->sub)); +#ifdef OMPD_SUPPORT + ompd_bp_task_end(); +#endif + // 4. 
pop context omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, parentTaskDescr); From 23f5ea07f42dae47f1401cdf60c289d3e8b172a6 Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 21 Sep 2018 10:22:47 -0700 Subject: [PATCH 53/64] [OMPD] Fix task/parallel handle interaction --- libompd/src/omp-debug.cpp | 49 +++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index ad71571c4..4a986b782 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -323,33 +323,46 @@ ompd_rc_t ompd_get_current_parallel_handle( if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { ompd_address_t taddr; - TValue prevTask = TValue(context, thread_handle->th) - .cast("omptarget_nvptx_TaskDescr", 0) - .access("prev") - .cast("omptarget_nvptx_TaskDescr", 1, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .dereference(); + TValue ph; + // The ompd_parallel_info_t we need is only present in the previous task + // of an implicit task. + uint16_t task_is_implicit = 0; + ret = ompd_rc_ok; + auto possibleTaskDescr = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); - ret = prevTask.getAddress(&taddr); + while (!task_is_implicit && ret == ompd_rc_ok) { + ret = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_implicit") + .castBase() + .getValue(task_is_implicit); + possibleTaskDescr = possibleTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", + 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference().getAddress(&taddr); + } - TValue ph; if (ret != ompd_rc_ok) { if (taddr.address == 0) { - ph = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) + ph = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext") .cast("omptarget_nvptx_ThreadPrivateContext", 1, OMPD_SEGMENT_CUDA_PTX_SHARED) - .access("ompd_levelZeroParallelInfo") - .cast("ompd_nvptx_parallel_info_t", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL); + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); } else { return ret; } } else { - ph = prevTask.access("ompd_thread_info") - .cast("ompd_nvptx_thread_info_t", 0, - OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .access("enclosed_parallel"); + ph = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); } ret = ph.getAddress(&taddr); @@ -561,7 +574,7 @@ ompd_rc_t ompd_get_task_parallel_handle( auto possibleTaskDescr = TValue(context, task_handle->th) .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL); - + while (!task_is_implicit && ret == ompd_rc_ok) { ret = possibleTaskDescr.access("ompd_thread_info") .cast("ompd_nvptx_thread_info_t", 0, From 8c4c7ae7e3bcd3e730bf6902569db2f0d556874d Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 25 Sep 2018 08:32:57 -0700 Subject: [PATCH 54/64] [OMPD] Make OMPD configurable for deviceRTLs --- libomptarget/deviceRTLs/nvptx/CMakeLists.txt | 4 ++++ libomptarget/deviceRTLs/nvptx/src/ompd-specific.h | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index 4ee4ba27c..b0fdc5b4d 100644 --- 
a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -126,6 +126,10 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) endif() + if(${LIBOMPTARGET_OMPD_SUPPORT}) + set(bc_flags ${bc_flags} -DOMPD_SUPPORT=1) + endif() + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared # to handle. Therefore, we use 'weak' instead. We are compiling only for the # device, so it should be equivalent. diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h index 91e43b747..8b929e5fe 100644 --- a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -1,5 +1,5 @@ -//TODO: (mr) move this to cmake file -#define OMPD_SUPPORT 1 +#ifndef __OMPD_SPECIFIC_H__ +#define __OMPD_SPECIFIC_H__ #ifdef OMPD_SUPPORT @@ -7,8 +7,6 @@ #include "option.h" #include -#ifndef __OMPD_SPECIFIC_H__ -#define __OMPD_SPECIFIC_H__ __device__ void ompd_init( void ); From 901ff1ba052998cd28279cb0d42203e10628eecf Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 25 Sep 2018 09:17:21 -0700 Subject: [PATCH 55/64] [OMPD] Clean up by removing unnecesarry header --- libompd/src/omp-debug.h | 1 - 1 file changed, 1 deletion(-) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index 61b809c25..1e4ec43b9 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -13,7 +13,6 @@ #ifdef __cplusplus #include -#include #define OMPD_DLL_VERSION 201811; From 6a825eb77ec380ebc0d5f2a7536c7b9304c7e0ab Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Tue, 25 Sep 2018 13:59:14 -0700 Subject: [PATCH 56/64] [OMPD} Remove deprecated ompd_process_handle_t --- libompd/src/omp-debug.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index 1e4ec43b9..e803012c4 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -55,10 +55,6 @@ typedef struct _ompd_cuda_thread_kernel_info_s { typedef struct _ompd_address_space_context_s ompd_address_space_context_t; -typedef struct _ompd_process_handle_s { - ompd_address_space_context_t *context; -} ompd_process_handle_t; - typedef struct _ompd_address_space_handle_s { ompd_address_space_context_t *context; ompd_device_t kind; From bc9c1e3747f0c36fb9fc5251169f577d9e00abcb Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Thu, 27 Sep 2018 09:17:24 -0700 Subject: [PATCH 57/64] [OMPD] Remove TODOs --- libompd/gdb-wrapper/OMPDCommand.cpp | 19 ++++++++----------- libompd/gdb-wrapper/ompd_typedefs.h | 4 ---- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index dc4ab9af6..b34b72396 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -245,7 +245,7 @@ void OMPDThreads::execute() const for(auto i: thread_ids) { ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( - addrhandle, ompd_thread_id_pthread, sizeof(i.second), + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second), &(i.second), &thread_handle); if (ret == ompd_rc_ok) { @@ -291,7 +291,7 @@ void OMPDThreads::execute() const result = functions->ompd_device_initialize( addrhandle, cpool->getGlobalOmpdContext(), - ompd_device_kind_cuda, + OMPD_DEVICE_KIND_CUDA, sizeof(i.coord.cudaContext), &i.coord.cudaContext, &cpool->ompd_device_handle); @@ -314,7 +314,7 @@ void OMPDThreads::execute() const 
ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( address_spaces[i.coord.cudaContext], - ompd_thread_id_cudalogical, + OMPD_THREAD_ID_CUDALOGICAL, sizeof(i.coord), &i.coord, &thread_handle); @@ -386,7 +386,7 @@ void OMPDLevels::execute() const ompd_thread_handle_t *thread_handle; ompd_parallel_handle_t *parallel_handle; ret = functions->ompd_get_thread_handle( - addrhandle, ompd_thread_id_pthread, sizeof(i.second) ,&(i.second), &thread_handle); + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second) ,&(i.second), &thread_handle); if (ret != ompd_rc_ok) { continue; } @@ -490,7 +490,7 @@ void OMPDApi::execute() const if (extraArgs[0] == "get_threads") { -#if 0 // MARKER_MR: TODO: reimplement this functionality with breakpoints +#if 0 if(extraArgs.size()>1) { hout << "Usage: odb api get_threads" << endl; @@ -529,7 +529,7 @@ vector odbGetThreadHandles(ompd_address_space_handle_t* a { ompd_thread_handle_t* thread_handle; ret = functions->ompd_get_thread_handle( - addrhandle, ompd_thread_id_pthread, sizeof(i.second) ,&(i.second), &thread_handle); + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second) ,&(i.second), &thread_handle); if (ret!=ompd_rc_ok) continue; thread_handles.push_back(thread_handle); @@ -571,7 +571,7 @@ vector odbGetCudaThreadHandles( ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( device_handles.at(i.coord.cudaContext).ompd_device_handle, - ompd_thread_id_cudalogical, + OMPD_THREAD_ID_CUDALOGICAL, sizeof(i.coord), &i.coord, &thread_handle); @@ -603,7 +603,6 @@ vector odbGetParallelRegions(OMPDFunctionsPtr functions bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { sout << "Checking of parallel IDs has been disabled for upgrade of ompd in branch ompd-devices\n"; - // MARKER_MR: TODO: fix checking of parallel ids return true; #if 0 bool res=true; @@ -628,7 +627,6 @@ bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { sout << "Checking of parallel IDs has been disable for upgrade of ompd in branch ompd-devices\n"; - // MARKER_MR: TODO: fix checking of parallel ids for num threads return true; #if 0 bool res=true; @@ -653,7 +651,6 @@ bool odbCheckParallelNumThreads(OMPDFunctionsPtr functions, vector ths) { sout << "Checking of task IDs has been disable for upgrade of ompd in branch ompd-devices\n"; - // TODO(mr): fix checking of task ids return true; #if 0 bool res=true; @@ -804,7 +801,7 @@ void OMPDTest::execute() const } sout << endl; pthread_t osthread; - functions->ompd_get_thread_id(thr_h, ompd_thread_id_pthread, sizeof(pthread_t), &osthread); + functions->ompd_get_thread_id(thr_h, OMPD_THREAD_ID_PTHREAD, sizeof(pthread_t), &osthread); host_contextPool->getThreadContext(&osthread)->setThisGdbContext(); odbCheckParallelIDs(functions, parallel_h); odbCheckTaskIDs(functions, task_h); diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h index cdaca39cb..825916434 100644 --- a/libompd/gdb-wrapper/ompd_typedefs.h +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -5,10 +5,6 @@ * Global initialization and finalization */ -// TODO: (mr) I dont have time to change every thread id kind, so this is some compat stuff -#define ompd_thread_id_pthread OMPD_THREAD_ID_PTHREAD -#define ompd_thread_id_cudalogical OMPD_THREAD_ID_CUDALOGICAL -#define ompd_device_kind_cuda OMPD_DEVICE_KIND_CUDA typedef ompd_rc_t (*ompd_initialize_fn_t) ( ompd_word_t api_version, From d861bcde29aceb08d8b59541afd9ce1bcf3db3db Mon Sep 17 00:00:00 2001 From: Manoel Roemmer Date: Fri, 
28 Sep 2018 11:07:11 -0700 Subject: [PATCH 58/64] [OMPD] Add some kernel info + removed TODO notes --- libompd/src/omp-debug.cpp | 11 ++++++----- libompd/src/omp-debug.h | 3 +++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 4a986b782..970e197ae 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -130,8 +130,6 @@ ompd_rc_t ompd_device_initialize( if (!device_context) return ompd_rc_bad_input; - // TODO:(mr) primitive type sizes can be different on devices? Think about implementing that - ompd_rc_t ret; uint64_t ompd_num_cuda_devices; @@ -145,7 +143,6 @@ ompd_rc_t ompd_device_initialize( for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { uint64_t cuda_ctx; - // TODO: (mr) think of a better way to cast contexts ret = TValue(process_handle->context, "ompd_CudaContextArray"). cast("ompd_cuda_context_ptr_t",1). getArrayElement(i). @@ -1024,6 +1021,8 @@ ompd_get_thread_handle(ompd_address_space_handle_t (*thread_handle)->cuda_kernel_info->cudaContext = p->cudaContext; (*thread_handle)->cuda_kernel_info->warpSize = p->warpSize; (*thread_handle)->cuda_kernel_info->gridId = p->gridId; + (*thread_handle)->cuda_kernel_info->gridDim = p->gridDim; + (*thread_handle)->cuda_kernel_info->blockDim = p->blockDim; } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") @@ -1124,7 +1123,9 @@ ompd_rc_t ompd_get_thread_id( cuda_thread_id->blockIdx.y = cuda_thread_id->blockIdx.z = 0; - // TODO (mr) add gridDim and blockDim + cuda_thread_id->gridDim = thread_handle->cuda_kernel_info->gridDim; + cuda_thread_id->blockDim = thread_handle->cuda_kernel_info->blockDim; + return ompd_rc_ok; } else { ompd_size_t size; @@ -1171,7 +1172,7 @@ ompd_rc_t ompd_get_state( if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { if (wait_id) - *wait_id = 0; //TODO: (mr) implement wait_ids in nvptx device rtl + *wait_id = 0; ret = TValue(context, thread_handle->th) .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_SHARED) .access("ompd_thread_info") diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index e803012c4..cd9c5ead7 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -31,6 +31,7 @@ extern "C" { #define STR(x) STR_HELPER(x) #include "ompd.h" +#include "ompd-types.h" /****************************************************************************** * General helper functions @@ -51,6 +52,8 @@ typedef struct _ompd_cuda_thread_kernel_info_s { ompd_addr_t cudaContext; ompd_addr_t warpSize; ompd_addr_t gridId; + ompd_dim3_t gridDim; + ompd_dim3_t blockDim; } ompd_cuda_thread_kernel_info_t; typedef struct _ompd_address_space_context_s ompd_address_space_context_t; From c1b4c5a79122ea3ba06f5ab6f4fa6433fa2283d8 Mon Sep 17 00:00:00 2001 From: "protze@itc.rwth-aachen.de" Date: Fri, 12 Oct 2018 19:17:20 +0200 Subject: [PATCH 59/64] Add information on scheduling parent for the master task in a parallel region --- runtime/src/kmp_runtime.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp index 896d431a1..e611f8ff7 100644 --- a/runtime/src/kmp_runtime.cpp +++ b/runtime/src/kmp_runtime.cpp @@ -1402,6 +1402,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { /* OMPT state */ this_thr->th.ompt_thread_info.state = omp_state_work_parallel; OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1); + OMPT_CUR_TASK_INFO(this_thr)->scheduling_parent = 
this_thr->th.th_current_task->td_parent; } #endif } @@ -1565,6 +1566,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, OMPT_CUR_TASK_INFO(master_th) ->thread_num = __kmp_tid_from_gtid(gtid); } + OMPT_CUR_TASK_INFO(master_th)->scheduling_parent = master_th->th.th_current_task->td_parent; /* OMPT state */ master_th->th.ompt_thread_info.state = omp_state_work_parallel; @@ -1786,6 +1788,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, OMPT_CUR_TASK_INFO(master_th) ->thread_num = __kmp_tid_from_gtid(gtid); } + OMPT_CUR_TASK_INFO(master_th)->scheduling_parent = master_th->th.th_current_task->td_parent; /* OMPT state */ master_th->th.ompt_thread_info.state = omp_state_work_parallel; @@ -1891,6 +1894,7 @@ int __kmp_fork_call(ident_t *loc, int gtid, OMPT_CUR_TASK_INFO(master_th) ->thread_num = __kmp_tid_from_gtid(gtid); } + OMPT_CUR_TASK_INFO(master_th)->scheduling_parent = master_th->th.th_current_task->td_parent; /* OMPT state */ master_th->th.ompt_thread_info.state = omp_state_work_parallel; From 66b0f8db22db06e8342c9a6859b6e1fa7d83eb6c Mon Sep 17 00:00:00 2001 From: Nina Loeseke Date: Wed, 17 Oct 2018 10:13:31 +0200 Subject: [PATCH 60/64] Initialize all address variables --- libompd/src/TargetValue.cpp | 6 +++++- libompd/src/omp-debug.cpp | 28 ++++++++++++++++------------ libompd/src/omp-debug.h | 8 ++++++++ runtime/src/ompt-general.cpp | 2 +- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index 109c8fa01..d1de522fd 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -362,7 +362,11 @@ TBaseValue TValue::castBase(const char *varName) { return TBaseValue(*this, size); } -TBaseValue TValue::castBase() const { return TBaseValue(*this, fieldSize); } +TBaseValue TValue::castBase() const { + if(pointerLevel>0) + return TBaseValue(*this, type_sizes.sizeof_pointer); + return TBaseValue(*this, fieldSize); +} TBaseValue TValue::castBase(ompd_target_prim_types_t baseType) const { return TBaseValue(*this, baseType); diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 970e197ae..42c93d5b1 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -190,7 +190,7 @@ ompd_rc_t ompd_get_thread_in_parallel( assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { uint16_t thread_idx; @@ -375,7 +375,7 @@ ompd_rc_t ompd_get_current_parallel_handle( (*parallel_handle)->th = taddr; (*parallel_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; } else { - ompd_address_t taddr, lwt; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ .cast("kmp_base_info_t") @@ -423,7 +423,7 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = parallel_handle->th, lwt; + ompd_address_t taddr = parallel_handle->th, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret; if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { @@ -558,7 +558,7 @@ ompd_rc_t ompd_get_task_parallel_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret; @@ -679,7 +679,7 @@ ompd_rc_t ompd_get_current_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table 
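The "Initialize all address variables" patch above replaces every bare ompd_address_t declaration with an explicit {OMPD_SEGMENT_UNSPECIFIED, 0} initializer, because several error paths inspect taddr.address even when getAddress() failed and never wrote to it. A tiny illustration of the hazard and the fix, with a stand-in address type (the real value of OMPD_SEGMENT_UNSPECIFIED is not assumed here):

#include <cstdint>
#include <cstdio>

typedef uint64_t seg_t;
constexpr seg_t SEGMENT_UNSPECIFIED = 0;   // placeholder, not the real constant
struct address_t { seg_t segment; uint64_t address; };

int main() {
  address_t uninitialized;                        // indeterminate stack garbage
  (void)uninitialized;                            // reading it would be UB
  address_t initialized = {SEGMENT_UNSPECIFIED, 0};
  if (initialized.address == 0)                   // now a well-defined check
    printf("no address resolved yet\n");
}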
not initialized!"); - ompd_address_t taddr, lwt; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret = ompd_rc_ok; lwt.segment = OMPD_SEGMENT_UNSPECIFIED; @@ -740,7 +740,7 @@ ompd_rc_t ompd_get_generating_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = task_handle->th, lwt; + ompd_address_t taddr = task_handle->th, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret = ompd_rc_stale_handle; TValue lwtValue = TValue(context, task_handle->lwt); @@ -800,7 +800,7 @@ ompd_rc_t ompd_get_scheduling_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret; if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { @@ -823,8 +823,11 @@ ompd_rc_t ompd_get_scheduling_task_handle( .cast("ompt_task_info_t") .access("scheduling_parent") // td->ompd_task_info.scheduling_parent .cast("kmp_taskdata_t", 1) - .dereference() - .getAddress(&taddr); + .castBase() + .getValue(taddr.address); + if (taddr.address == 0) { + return ompd_rc_unavailable; + } } if (ret != ompd_rc_ok) @@ -835,6 +838,7 @@ ompd_rc_t ompd_get_scheduling_task_handle( return ret; (*parent_task_handle)->th = taddr; + (*parent_task_handle)->lwt = {OMPD_SEGMENT_UNSPECIFIED,0}; (*parent_task_handle)->ah = task_handle->ah; (*parent_task_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ret; @@ -857,7 +861,7 @@ ompd_rc_t ompd_get_task_in_parallel( assert(callbacks && "Callback table not initialized!"); ompd_rc_t ret; - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { ret = TValue(context, parallel_handle->th) @@ -961,7 +965,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t .getPtrArrayElement(p->threadIdx.x) .dereference(); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ret = th.getAddress(&taddr); if (ret != ompd_rc_ok) { @@ -1038,7 +1042,7 @@ ompd_get_thread_handle(ompd_address_space_handle_t .getArrayElement(tId) /*__kmp_threads[t]*/ .access("th"); /*__kmp_threads[t]->th*/ - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ret = th.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index cd9c5ead7..81b652dab 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -89,6 +89,14 @@ typedef struct _ompd_task_handle_s { used to retrieve this parallel region handle */ + _ompd_task_handle_s(){ + ah=NULL; + th.segment=OMPD_SEGMENT_UNSPECIFIED; + lwt.segment=OMPD_SEGMENT_UNSPECIFIED; + th.address=0; + lwt.address=0; + cuda_kernel_info=NULL; + } } ompd_task_handle_t; #endif diff --git a/runtime/src/ompt-general.cpp b/runtime/src/ompt-general.cpp index 1ef082082..9de376a47 100644 --- a/runtime/src/ompt-general.cpp +++ b/runtime/src/ompt-general.cpp @@ -371,7 +371,7 @@ void ompt_post_init() { } void ompt_fini() { - if (ompt_enabled.enabled) { + if (ompt_enabled.enabled && ompt_start_tool_result && ompt_start_tool_result->finalize) { ompt_start_tool_result->finalize(&(ompt_start_tool_result->tool_data)); } From 726b21b6b7d2fa16991ed3b1d7ae5dc8cbea0a90 Mon Sep 17 00:00:00 2001 From: Joachim Protze Date: Fri, 14 Dec 2018 12:32:09 +0100 Subject: [PATCH 61/64] Fix ompd_get_thread_id --- libompd/src/omp-debug.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 42c93d5b1..dcaac27d5 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -1091,7 +1091,7 @@ ompd_rc_t ompd_get_thread_id( return ompd_rc_stale_handle; ompd_rc_t ret; - if (kind != OMPD_THREAD_ID_CUDALOGICAL) { + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { if (sizeof_thread_id != sizeof(ompd_cudathread_coord_t)) { return ompd_rc_bad_input; } From 36fcaf349946cf787daa376aceee0e3cc262d7ec Mon Sep 17 00:00:00 2001 From: Nina Loeseke Date: Wed, 16 Jan 2019 10:28:04 +0100 Subject: [PATCH 62/64] Initialize lwt field for ompd_get_task_in_parallel --- libompd/src/omp-debug.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index dcaac27d5..32832bc52 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -891,6 +891,7 @@ ompd_rc_t ompd_get_task_in_parallel( (*task_handle)->th = taddr; (*task_handle)->ah = parallel_handle->ah; + (*task_handle)->lwt = {OMPD_SEGMENT_UNSPECIFIED,0}; (*task_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } From e1c6cad0b952fedbd388a7cb70492a8b67ad3f83 Mon Sep 17 00:00:00 2001 From: Joachim Protze Date: Thu, 17 Jan 2019 10:03:10 +0100 Subject: [PATCH 63/64] Fix ompd_get_task_function for implicit tasks --- libompd/src/omp-debug.cpp | 46 +++++++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 32832bc52..4dd55fd2f 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -1296,13 +1296,45 @@ ompd_rc_t ompd_get_task_function( if(task_handle->lwt.address!=0) return ompd_rc_bad_input; // We need to decide what we do here. else - ret = TValue(context, task_handle->th). - cast("kmp_taskdata_t",0). /*t*/ - getArrayElement(1). /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ - cast("kmp_task_t",0). /* (kmp_task_t *) */ - access("routine"). /*td->ompt_task_info*/ - castBase(). 
- getValue(task_addr->address); + { + ompd_word_t val; + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_flags + .cast("kmp_tasking_flags_t") + .check("tasktype", &val); // td->td_flags.tasktype + + if (ret != ompd_rc_ok) + return ret; + + if (val==1) { // tasktype: explicit = 1, implicit = 0 + + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t",0) /*t*/ + .getArrayElement(1) /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ + .cast("kmp_task_t",0) /* (kmp_task_t *) */ + .access("routine") /*td->ompt_task_info*/ + .castBase() + .getValue(task_addr->address); + + } else { + + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("t_parent") /*td.td_team->t.t_parent*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t.t_parent->t*/ + .cast("kmp_base_team_t", 0) + .access("t_pkfn") /*td.td_team->t.t_parent->t.t_pkfn*/ + .castBase() + .getValue(task_addr->address); + + } + } } return ret; } From f9bb73921cc096bde4d43e6068a47ac44c2489db Mon Sep 17 00:00:00 2001 From: Nina Loeseke Date: Thu, 17 Jan 2019 14:23:32 +0100 Subject: [PATCH 64/64] Use the right team to get the function pointer --- libompd/src/omp-debug.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 4dd55fd2f..fae2b63cc 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -1325,11 +1325,7 @@ ompd_rc_t ompd_get_task_function( .cast("kmp_team_p", 1) .access("t") /*td.td_team->t*/ .cast("kmp_base_team_t", 0) - .access("t_parent") /*td.td_team->t.t_parent*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t.t_parent->t*/ - .cast("kmp_base_team_t", 0) - .access("t_pkfn") /*td.td_team->t.t_parent->t.t_pkfn*/ + .access("t_pkfn") /*td.td_team->t.t_pkfn*/ .castBase() .getValue(task_addr->address);
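The last two patches make the host side of ompd_get_task_function look at td_flags.tasktype first: an explicit task records its entry point in the kmp_task_t behind the task data, while an implicit task's entry point is its own team's microtask t_pkfn (the final patch drops the extra hop through t_parent). A self-contained sketch of that selection with stand-in types:

#include <cstdio>

using microtask_t = void (*)();

struct team_t { microtask_t t_pkfn; };
struct taskdata_t {
  bool explicit_task;     // models td_flags.tasktype (1 = explicit)
  microtask_t routine;    // models KMP_TASKDATA_TO_TASK(td)->routine
  team_t *td_team;
};

microtask_t task_entry(const taskdata_t *td) {
  return td->explicit_task ? td->routine : td->td_team->t_pkfn;
}

void outlined_region() { puts("implicit task: team microtask"); }
void task_body()       { puts("explicit task: task routine"); }

int main() {
  team_t team{outlined_region};
  taskdata_t implicit_td{false, nullptr, &team};
  taskdata_t explicit_td{true, task_body, &team};
  task_entry(&implicit_td)();
  task_entry(&explicit_td)();
}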