diff --git a/CMakeLists.txt b/CMakeLists.txt index 14fc4d680..6e1ad87ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,8 @@ if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_S "Enable -Werror flags to turn warnings into errors for supporting compilers.") set(OPENMP_LIBDIR_SUFFIX "" CACHE STRING "Suffix of lib installation directory, e.g. 64 => lib64") + # Do not use OPENMP_LIBDIR_SUFFIX directly, use OPENMP_INSTALL_LIBDIR. + set(OPENMP_INSTALL_LIBDIR "lib${OPENMP_LIBDIR_SUFFIX}") # Group test settings. set(OPENMP_TEST_C_COMPILER ${CMAKE_C_COMPILER} CACHE STRING @@ -28,7 +30,7 @@ if (OPENMP_STANDALONE_BUILD OR "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_S else() set(OPENMP_ENABLE_WERROR ${LLVM_ENABLE_WERROR}) # If building in tree, we honor the same install suffix LLVM uses. - set(OPENMP_LIBDIR_SUFFIX ${LLVM_LIBDIR_SUFFIX}) + set(OPENMP_INSTALL_LIBDIR "lib${LLVM_LIBDIR_SUFFIX}") if (NOT MSVC) set(OPENMP_TEST_C_COMPILER ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) diff --git a/README.rst b/README.rst index ea79f1948..9fb34dfc2 100644 --- a/README.rst +++ b/README.rst @@ -257,9 +257,11 @@ Options for ``libomptarget`` Options for ``NVPTX device RTL`` -------------------------------- -**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``OFF|ON`` +**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``ON|OFF`` Enable CUDA LLVM bitcode offloading device RTL. This is used for link time - optimization of the OMP runtime and application code. + optimization of the OMP runtime and application code. This option is enabled + by default if the build system determines that `CMAKE_C_COMPILER` is able to + compile and link the library. **LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""`` Location of a CUDA compiler capable of emitting LLVM bitcode. 
Currently only diff --git a/libompd/cuda_examples/test_target_generic.c b/libompd/cuda_examples/test_target_generic.c new file mode 100644 index 000000000..db881da5e --- /dev/null +++ b/libompd/cuda_examples/test_target_generic.c @@ -0,0 +1,33 @@ +// Testing generic mode of nvptx devRtl +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + test_breakpoint(); + #pragma omp parallel for + for (i=0; i +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + omp_set_nested(1); + #pragma omp target map(v1, v2, p) + { + omp_set_nested(1); + #pragma omp parallel shared(v1, v2, p, N) num_threads(4) + { + printf("Outer region - thread ID: %d\n", omp_get_thread_num()); + #pragma omp for + for (int i = 0; i < N; ++i) + { + float acc = 0; + #pragma omp parallel shared(v1, v2, p, N) num_threads(4) + #pragma omp for + for(int j = 0; j < N; ++j) + { + test_breakpoint(); + p[i] += v1[i] + v2[i]; + } + } + } + printf("End of target region\n"); + } +//output(p, N); +} +int main() { + printf("calling vec_mul...\n"); + vec_mult(64); + printf("done\n"); + return 0; +} diff --git a/libompd/cuda_examples/test_target_noparallel.c b/libompd/cuda_examples/test_target_noparallel.c new file mode 100644 index 000000000..2e2f2f51c --- /dev/null +++ b/libompd/cuda_examples/test_target_noparallel.c @@ -0,0 +1,24 @@ +#include + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + #pragma omp target map(v1, v2, p) + { + test_breakpoint(); + p[0] = v[0] * v[0]; + } +} +int main() { + printf("calling vec_mul...\n"); + vec_mult(64); + printf("done\n"); + return 0; +} diff --git 
a/libompd/cuda_examples/test_target_single.c b/libompd/cuda_examples/test_target_single.c new file mode 100644 index 000000000..4a2bc3260 --- /dev/null +++ b/libompd/cuda_examples/test_target_single.c @@ -0,0 +1,29 @@ +#include + +#pragma omp declare target +float mult(float u, float v) { + return u * v; +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + #pragma omp parallel for + for (i=0; i + +#pragma omp declare target +void test_breakpoint() { + asm(""); +} +#pragma omp end declare target + +void vec_mult(int N) +{ + int i; + float p[N], v1[N], v2[N]; + //init(v1, v2, N); + #pragma omp target map(v1, v2, p) + { + #pragma omp parallel for + for (i=0; i +#include + +#pragma omp declare target +void task1() { + printf("Hello from Task 1\n"); + uint32_t enter_frame = 0; + for(;1;) { + } +} +void task2() { + printf("Hello from Task 2\n"); + for(;1;) { + } +} +#pragma omp end declare target + +int main() { + #pragma omp target + { + #pragma omp parallel num_threads(4) + { + #pragma omp single + { + #pragma omp task + task1(); + #pragma omp task + task2(); + } + } + } + return 0; +} diff --git a/libompd/gdb-wrapper/CMakeLists.txt b/libompd/gdb-wrapper/CMakeLists.txt index c3ea2824c..ec87ef31b 100644 --- a/libompd/gdb-wrapper/CMakeLists.txt +++ b/libompd/gdb-wrapper/CMakeLists.txt @@ -1,5 +1,9 @@ project (odb) +cmake_minimum_required(VERSION 2.8) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") + set (cppfiles InputOutputManager.cpp ChildProcess.cpp @@ -30,14 +34,12 @@ add_executable (odb-bin ${cppfiles} odb.cpp) set_target_properties (odb-bin PROPERTIES OUTPUT_NAME odb) add_library (odb ${cppfiles}) -if (ODB_LINUX) -target_link_libraries (odb-bin dl) -target_link_libraries (odb dl) -endif (ODB_LINUX) +target_link_libraries (odb-bin dl) +target_link_libraries (odb dl) include_directories ( ${CMAKE_CURRENT_SOURCE_DIR} -# 
${CMAKE_CURRENT_SOURCE_DIR}/../src/ + ${CMAKE_CURRENT_SOURCE_DIR}/../src/ ${CMAKE_BINARY_DIR}/include ) diff --git a/libompd/gdb-wrapper/Callbacks.cpp b/libompd/gdb-wrapper/Callbacks.cpp index e15e7e795..77c91ec1b 100644 --- a/libompd/gdb-wrapper/Callbacks.cpp +++ b/libompd/gdb-wrapper/Callbacks.cpp @@ -37,17 +37,16 @@ void initializeCallbacks(const GdbProcessPtr &proc) gdb = proc; // Initialize static table - cb.dmemory_alloc = CB_dmemory_alloc; - cb.dmemory_free = CB_dmemory_free; - cb.print_string = CB_print_string; - cb.get_thread_context_for_osthread = CB_thread_context; - cb.get_containing_process_context = CB_process_context; - cb.tsizeof_prim = CB_tsizeof_prim; - cb.tsymbol_addr = CB_tsymbol_addr; - cb.read_tmemory = CB_read_tmemory; - cb.write_tmemory = CB_write_tmemory; - cb.host_to_target = CB_host_to_target; - cb.target_to_host = CB_target_to_host; + cb.memory_alloc = CB_dmemory_alloc; + cb.memory_free = CB_dmemory_free; + cb.print_string = CB_print_string; + cb.get_thread_context_for_thread_id = CB_thread_context; + cb.sizeof_types = CB_tsizeof_prim; + cb.symbol_addr_lookup = CB_tsymbol_addr; + cb.read_memory = CB_read_tmemory; + cb.write_memory = CB_write_tmemory; + cb.host_to_device = CB_host_to_target; + cb.device_to_host = CB_target_to_host; } ompd_callbacks_t * getCallbacksTable() @@ -78,14 +77,14 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( ompd_address_space_context_t *context, - ompd_osthread_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext ) { ompd_rc_t ret = context ? 
ompd_rc_ok : ompd_rc_stale_handle; - if (kind == ompd_osthread_cudalogical) { + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { *tcontext = ((OMPDContext*)context)->getContextForThread((CudaThread*)osthread); } else { @@ -126,7 +125,7 @@ void init_sizes(){ ompd_rc_t CB_tsizeof_prim( ompd_address_space_context_t *context, - ompd_target_type_sizes_t *sizes) + ompd_device_type_sizes_t *sizes) { ompd_rc_t ret = context ? ompd_rc_ok : ompd_rc_stale_handle; static int inited = 0; @@ -135,7 +134,12 @@ ompd_rc_t CB_tsizeof_prim( inited=1; init_sizes(); } - memcpy(sizes, prim_sizes, sizeof(prim_sizes[0])*ompd_type_max); + sizes->sizeof_char = prim_sizes[ompd_type_char]; + sizes->sizeof_short = prim_sizes[ompd_type_short]; + sizes->sizeof_int = prim_sizes[ompd_type_int]; + sizes->sizeof_long = prim_sizes[ompd_type_long]; + sizes->sizeof_long_long = prim_sizes[ompd_type_long_long]; + sizes->sizeof_pointer = prim_sizes[ompd_type_pointer]; return ret; } @@ -175,7 +179,7 @@ ompd_rc_t CB_tsymbol_addr( parser.matchAddressValue(gdb->readOutput().c_str(), addr); if (strlen(addr) > 0) - symbol_addr->address = (ompd_taddr_t) strtoull (addr, NULL, 0); + symbol_addr->address = (ompd_addr_t) strtoull (addr, NULL, 0); else if (strlen(addr) == 0) ret = ompd_rc_error; @@ -267,7 +271,7 @@ ompd_rc_t CB_write_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, const void *buffer) { return ompd_rc_unsupported; @@ -277,7 +281,7 @@ ompd_rc_t CB_read_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, void *buffer) { if (!context) diff --git a/libompd/gdb-wrapper/Callbacks.h b/libompd/gdb-wrapper/Callbacks.h index d93c74580..3e8f379be 100644 --- a/libompd/gdb-wrapper/Callbacks.h +++ b/libompd/gdb-wrapper/Callbacks.h @@ -48,7 +48,7 @@ ompd_rc_t CB_dmemory_free ( ompd_rc_t CB_thread_context ( 
ompd_address_space_context_t *context, - ompd_osthread_kind_t kind, + ompd_thread_id_t kind, ompd_size_t sizeof_osthread, const void* osthread, ompd_thread_context_t **tcontext); @@ -59,7 +59,7 @@ ompd_rc_t CB_process_context ( ompd_rc_t CB_tsizeof_prim ( ompd_address_space_context_t *context, - ompd_target_type_sizes_t *sizes); + ompd_device_type_sizes_t *sizes); ompd_rc_t CB_tsymbol_addr ( ompd_address_space_context_t *context, @@ -71,7 +71,7 @@ ompd_rc_t CB_read_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, const ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, void *buffer ); @@ -79,7 +79,7 @@ ompd_rc_t CB_write_tmemory ( ompd_address_space_context_t *context, ompd_thread_context_t *tcontext, const ompd_address_t addr, - ompd_tword_t nbytes, + ompd_word_t nbytes, const void *buffer ); diff --git a/libompd/gdb-wrapper/CudaGdb.h b/libompd/gdb-wrapper/CudaGdb.h index b690257b6..0408668b0 100644 --- a/libompd/gdb-wrapper/CudaGdb.h +++ b/libompd/gdb-wrapper/CudaGdb.h @@ -13,6 +13,7 @@ #include #include #include "ompd.h" +#include "../src/ompd-private.h" struct CudaThread { ompd_cudathread_coord_t coord; diff --git a/libompd/gdb-wrapper/OMPDCommand.cpp b/libompd/gdb-wrapper/OMPDCommand.cpp index b1a82e67f..b34b72396 100644 --- a/libompd/gdb-wrapper/OMPDCommand.cpp +++ b/libompd/gdb-wrapper/OMPDCommand.cpp @@ -5,14 +5,16 @@ * Author: Ignacio Laguna * Contact: ilaguna@llnl.gov */ -#include +//#include #include "OMPDCommand.h" #include "OMPDContext.h" #include "Callbacks.h" #include "OutputString.h" #include "Debug.h" +#include "omp.h" #include "ompd.h" -#include "ompd_test.h" +//#include "ompd_test.h" +#define ODB_LINUX #include "CudaGdb.h" #include @@ -23,24 +25,60 @@ using namespace ompd_gdb; using namespace std; -const char * ompd_state_names[256]; extern OMPDHostContextPool * host_contextPool; +/* --- OMPDIcvs ------------------------------------------------------------- */ + +OMPDIcvs::OMPDIcvs(OMPDFunctionsPtr 
functions, + ompd_address_space_handle_t *addrhandle) + : functions(functions) { + ompd_icv_id_t next_icv_id = ompd_icv_undefined; + int more = 1; + const char *next_icv_name_str; + ompd_scope_t next_scope; + ompd_rc_t ret = ompd_rc_ok; + while (more && ret == ompd_rc_ok) { + ret = functions->ompd_enumerate_icvs(addrhandle, + next_icv_id, + &next_icv_id, + &next_icv_name_str, + &next_scope, + &more); + if (ret == ompd_rc_ok) { + availableIcvs[next_icv_name_str] = + std::pair(next_icv_id, next_scope); + } + } +} + + +ompd_rc_t OMPDIcvs::get(ompd_parallel_handle_t *handle, const char *name, + ompd_word_t *value) { + ompd_icv_id_t icv; + ompd_scope_t scope; + + auto &p = availableIcvs.at(name); + icv = p.first; + scope = p.second; + + if (scope != ompd_scope_parallel) { + return ompd_rc_bad_input; + } + + return functions->ompd_get_icv_from_scope((void *)handle, scope, icv, value); +} + /* --- OMPDCommandFactory --------------------------------------------------- */ OMPDCommandFactory::OMPDCommandFactory() { functions = OMPDFunctionsPtr(new OMPDFunctions); -#define ompd_state_macro(state, code) ompd_state_names[code] = #state; - FOREACH_OMPD_STATE(ompd_state_macro) -#undef ompd_state_macro - // Load OMPD DLL and get a handle #ifdef ODB_LINUX - functions->ompdLibHandle = dlopen("libompd_intel.so", RTLD_LAZY); + functions->ompdLibHandle = dlopen("libompd.so", RTLD_LAZY); #elif defined(ODB_MACOS) - functions->ompdLibHandle = dlopen("libompd_intel.dylib", RTLD_LAZY); + functions->ompdLibHandle = dlopen("libompd.dylib", RTLD_LAZY); #else #error Unsupported platform! 
#endif @@ -67,16 +105,24 @@ OMPDCommandFactory::OMPDCommandFactory() FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) #undef OMPD_FIND_API_FUNCTION +} - //functions->test_CB_tsizeof_prim = - // (void (*)()) findFunctionInLibrary("test_CB_tsizeof_prim"); - //functions->test_CB_dmemory_alloc = - // (void (*)()) findFunctionInLibrary("test_CB_dmemory_alloc"); +OMPDCommandFactory::~OMPDCommandFactory() +{ + ompd_rc_t ret; + ret = functions->ompd_release_address_space_handle(addrhandle); + if (ret != ompd_rc_ok) + { + out << "ERROR: could not finalize target address space\n"; + } +} +void OMPDCommandFactory::initOmpd() +{ // Initialize OMPD library ompd_callbacks_t *table = getCallbacksTable(); assert(table && "Invalid callbacks table"); - ompd_rc_t ret = functions->ompd_initialize(table); + ompd_rc_t ret = functions->ompd_initialize(0, table); if (ret != ompd_rc_ok) { out << "ERROR: could not initialize OMPD\n"; @@ -86,22 +132,12 @@ FOREACH_OMPD_API_FN(OMPD_FIND_API_FUNCTION) /*&prochandle, */&addrhandle); if (ret != ompd_rc_ok) { + addrhandle = nullptr; out << "ERROR: could not initialize target process\n"; } -} - -OMPDCommandFactory::~OMPDCommandFactory() -{ - ompd_rc_t ret; -// ret = functions->ompd_process_finalize(prochandle); -// if (ret != ompd_rc_ok) -// { -// out << "ERROR: could not finalize target process\n"; -// } - ret = functions->ompd_release_address_space_handle(addrhandle); - if (ret != ompd_rc_ok) + else { - out << "ERROR: could not finalize target address space\n"; + icvs = OMPDIcvsPtr(new OMPDIcvs(functions, addrhandle)); } } @@ -122,21 +158,28 @@ void * OMPDCommandFactory::findFunctionInLibrary(const char *fun) const return ret; } -OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& extraArgs) const +OMPDCommand* OMPDCommandFactory::create(const char *str, const vector& extraArgs) { + if (addrhandle == nullptr) { + initOmpd(); + } + if (strcmp(str, "test") == 0) return new OMPDTestCallbacks(functions, addrhandle, extraArgs); else if 
(strcmp(str, "threads") == 0) return new OMPDThreads(functions, addrhandle, extraArgs); else if (strcmp(str, "levels") == 0) - return new OMPDLevels(functions, addrhandle, extraArgs); + return new OMPDLevels(functions, addrhandle, icvs, extraArgs); else if (strcmp(str, "callback") == 0) return new OMPDCallback(functions, addrhandle, extraArgs); else if (strcmp(str, "api") == 0) return new OMPDApi(functions, addrhandle, extraArgs); else if (strcmp(str, "testapi") == 0) - return new OMPDTest(functions, addrhandle, extraArgs); - + return new OMPDTest(functions, addrhandle, icvs, extraArgs); + else if (strcmp(str, "parallel") == 0) + return new OMPDParallelRegions(functions, addrhandle, icvs, extraArgs); + else if (strcmp(str, "tasks") == 0) + return new OMPDTasks(functions, addrhandle, icvs, extraArgs); return new OMPDNull; } @@ -183,6 +226,17 @@ const char* OMPDTestCallbacks::toString() const void OMPDThreads::execute() const { + // get state names + map host_state_names; + ompd_word_t more_states = 1; + ompd_word_t next_state = omp_state_undefined; + host_state_names[next_state] = "ompd_state_undefined"; + while (more_states) { + const char *state_name; + functions->ompd_enumerate_states(addrhandle, next_state, &next_state, &state_name, &more_states); + host_state_names[next_state] = state_name; + } + printf("\nHOST THREADS\n"); printf("Debugger_handle Thread_handle System_thread\n"); printf("--------------------------------------------------\n"); @@ -191,15 +245,16 @@ void OMPDThreads::execute() const for(auto i: thread_ids) { ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( - addrhandle, ompd_osthread_pthread, sizeof(i.second), + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second), &(i.second), &thread_handle); if (ret == ompd_rc_ok) { - ompd_state_t state; + ompd_word_t state; ompd_wait_id_t wait_id; ret = functions->ompd_get_state(thread_handle, &state, &wait_id); - printf(" %-12u %p 0x%lx\t%i\t%lx\n", - (unsigned 
int)i.first, thread_handle, i.second, state, wait_id); + printf(" %-12u %p 0x%lx\t%s\t%lx\n", + (unsigned int)i.first, thread_handle, i.second, host_state_names[state], wait_id); + functions->ompd_release_thread_handle(thread_handle); } else { @@ -211,7 +266,20 @@ void OMPDThreads::execute() const int omp_cuda_threads = 0; vector cuda_ContextPools; map device_initialized; - map address_spaces; + map address_spaces; + ompd_word_t last_state = -1; + ompd_cudathread_coord_t last_coords; + vector device_thread_handles; + + // get cuda states + map cuda_state_names; + more_states = 1; + next_state = omp_state_undefined; + cuda_state_names[next_state] = "omp_state_undefined"; + + printf("\nCUDA THREADS\n"); + printf("Cuda block from Thread to Thread state\n"); + printf("------------------------------------------\n"); for(auto i: cuda.threads) { if (!device_initialized[i.coord.cudaContext]) { @@ -221,26 +289,76 @@ void OMPDThreads::execute() const device_initialized[i.coord.cudaContext] = true; result = functions->ompd_device_initialize( - cpool->getGlobalOmpdContext(), - i.coord.cudaContext, - ompd_device_kind_cuda, + addrhandle, + cpool->getGlobalOmpdContext(), + OMPD_DEVICE_KIND_CUDA, + sizeof(i.coord.cudaContext), + &i.coord.cudaContext, &cpool->ompd_device_handle); if (result != ompd_rc_ok) + { continue; + } address_spaces[i.coord.cudaContext] = cpool->ompd_device_handle; + while (more_states) { + const char *state_name; + functions->ompd_enumerate_states(cpool->ompd_device_handle, + next_state, &next_state, + &state_name, &more_states); + cuda_state_names[next_state] = state_name; + } } ompd_thread_handle_t* thread_handle; ompd_rc_t ret = functions->ompd_get_thread_handle( address_spaces[i.coord.cudaContext], - ompd_osthread_cudalogical, + OMPD_THREAD_ID_CUDALOGICAL, sizeof(i.coord), &i.coord, &thread_handle); if (ret == ompd_rc_ok) + { + ompd_word_t state; + device_thread_handles.push_back(thread_handle); + ret = functions->ompd_get_state(thread_handle, &state, 
NULL); + if (last_state == -1) { + last_state = state; + last_coords = i.coord; + printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); + } else if (state != last_state || i.coord.blockIdx.x != last_coords.blockIdx.x || i.coord.threadIdx.x != last_coords.threadIdx.x + 1) { + printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); + last_coords = i.coord; + last_state = state; + printf("(%li,0,0) (%li,0,0)", i.coord.blockIdx.x, i.coord.threadIdx.x); + } else { /* state == last_state*/ + last_coords = i.coord; + } omp_cuda_threads++; + } + } + // Check for non-unique handles + for (auto i: device_thread_handles) { + for (auto j: device_thread_handles) { + int value; + if (i == j) { + continue; + } + ompd_rc_t ret = functions->ompd_thread_handle_compare(i, j, &value); + if (!value) { + printf("FOUND NON-UNIQUE THREAD HANDLES FOR DIFFERENT THREADS\n"); + } + } + } + + // release thread handles + for (auto i: device_thread_handles) { + functions->ompd_release_thread_handle(i); + } + + if (last_state != -1) { + printf(" (%li,0,0) %s\n", last_coords.threadIdx.x, cuda_state_names[last_state]); } if (cuda.threads.size() != 0) { @@ -259,30 +377,28 @@ const char* OMPDThreads::toString() const void OMPDLevels::execute() const { -/* ompd_size_t num_os_threads; - ompd_rc_t ret = CB_num_os_threads(contextPool->getGlobalOmpdContext(), &num_os_threads); - assert(ret==ompd_rc_ok && "Error calling OMPD!"); - ompd_osthread_t* osThreads = (ompd_osthread_t*) - malloc(sizeof(ompd_osthread_t)*num_os_threads); - ret = CB_get_os_threads (contextPool->getGlobalOmpdContext(), &num_os_threads, &osThreads); - assert(ret==ompd_rc_ok && "Error calling OMPD!"); - + ompd_rc_t ret; printf("\n"); printf("Thread_handle Nesting_level\n"); printf("-------------------------------\n"); - for (size_t i=0; i < num_os_threads; ++i) + for (auto i: getThreadIDsFromDebugger()) { - ompd_thread_handle_t thread_handle; + ompd_thread_handle_t *thread_handle; + 
ompd_parallel_handle_t *parallel_handle; ret = functions->ompd_get_thread_handle( - contextPool->getGlobalOmpdContext(), &(osThreads[i]), &thread_handle); + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second) ,&(i.second), &thread_handle); + if (ret != ompd_rc_ok) { + continue; + } + ret = functions->ompd_get_current_parallel_handle(thread_handle, + ¶llel_handle); if (ret == ompd_rc_ok) { - ompd_tword_t level=0; - ret = functions->ompd_nesting_level( - contextPool->getGlobalOmpdContext(), &thread_handle, &level); - printf("%-12u %ld\n", (unsigned int)thread_handle, level); + ompd_word_t level=0; + icvs->get(parallel_handle, "levels-var", &level); + printf("%-12p %ld\n", thread_handle, level); } - }*/ + } } const char* OMPDLevels::toString() const @@ -293,23 +409,9 @@ const char* OMPDLevels::toString() const /* --- OMPDCallback ----------------------------------------------------------- */ -ompd_target_prim_types_t get_prim_type_from_string(const string& str) -{ - const char * names[ompd_type_max] = { - "CHAR", - "SHORT", - "INT", - "LONG", - "LONG_LONG", - "POINTER" - }; - for (int i = 0; 0\" to get more help on the usage" << endl; return; - } + } /*ompd_rc_t CB_read_tmemory ( ompd_context_t *context, - ompd_taddr_t addr, + ompd_addr_t addr, ompd_tword_t bufsize, void *buffer );*/ @@ -333,7 +435,7 @@ void OMPDCallback::execute() const return; } long long temp=0; - ompd_taddr_t addr = (ompd_taddr_t)strtoll(extraArgs[1].c_str(), NULL, 0); + ompd_addr_t addr = (ompd_addr_t)strtoll(extraArgs[1].c_str(), NULL, 0); int cnt = atoi(extraArgs[2].c_str()); ret = CB_read_tmemory( host_contextPool->getGlobalOmpdContext(), NULL, {0,addr}, cnt, &temp); @@ -345,7 +447,7 @@ void OMPDCallback::execute() const /*ompd_rc_t CB_tsymbol_addr ( ompd_context_t *context, const char *symbol_name, - ompd_taddr_t *symbol_addr);*/ + ompd_addr_t *symbol_addr);*/ if (extraArgs[0] == "tsymbol_addr") { @@ -370,7 +472,7 @@ const char* OMPDCallback ::toString() const } void OMPDApi::execute() const 
-{ +{ ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") @@ -378,7 +480,7 @@ void OMPDApi::execute() const hout << "API functions available: read_tmemory, ttype, ttype_sizeof, ttype_offset, tsymbol_addr" << endl << "Use \"odb api \" to get more help on the usage" << endl; return; - } + } //ompd_rc_t ompd_get_threads ( // ompd_context_t *context, /* IN: debugger handle for the target */ @@ -388,6 +490,7 @@ void OMPDApi::execute() const if (extraArgs[0] == "get_threads") { +#if 0 if(extraArgs.size()>1) { hout << "Usage: odb api get_threads" << endl; @@ -395,8 +498,8 @@ void OMPDApi::execute() const } ompd_thread_handle_t ** thread_handle_array; int num_handles; - - + + ret = functions->ompd_get_threads ( addrhandle, &thread_handle_array, &num_handles); if (ret != ompd_rc_ok) @@ -404,8 +507,10 @@ void OMPDApi::execute() const sout << num_handles << " OpenMP threads:" << endl; for (int i=0; i odbGetThreadHandles(ompd_address_space_handle_t* a { ompd_thread_handle_t* thread_handle; ret = functions->ompd_get_thread_handle( - addrhandle, ompd_osthread_pthread, sizeof(i.second) ,&(i.second), &thread_handle); + addrhandle, OMPD_THREAD_ID_PTHREAD, sizeof(i.second) ,&(i.second), &thread_handle); if (ret!=ompd_rc_ok) continue; thread_handles.push_back(thread_handle); @@ -432,24 +537,74 @@ vector odbGetThreadHandles(ompd_address_space_handle_t* a return thread_handles; } +map odbInitCudaDevices(OMPDFunctionsPtr functions, CudaGdb &cuda, + ompd_address_space_handle_t *addrhandle) +{ + map ret; + map device_initialized; + for (auto i: cuda.threads) { + if (!device_initialized[i.coord.cudaContext]) { + ret.emplace(i.coord.cudaContext, &i); + device_initialized[i.coord.cudaContext] = true; + functions->ompd_device_initialize( + addrhandle, + ret.at(i.coord.cudaContext).getGlobalOmpdContext(), + OMPD_DEVICE_KIND_CUDA, + sizeof(i.coord.cudaContext), + &i.coord.cudaContext, + &ret.at(i.coord.cudaContext).ompd_device_handle); + } + } + return ret; +} + +vector 
odbGetCudaThreadHandles( + OMPDFunctionsPtr functions, + CudaGdb &cuda, + map &device_handles) +{ + ompd_rc_t ret; + + vector device_thread_handles; + + for(auto i: cuda.threads) { + ompd_thread_handle_t* thread_handle; + ompd_rc_t ret = functions->ompd_get_thread_handle( + device_handles.at(i.coord.cudaContext).ompd_device_handle, + OMPD_THREAD_ID_CUDALOGICAL, + sizeof(i.coord), &i.coord, + &thread_handle); + + if (ret == ompd_rc_ok) + { + device_thread_handles.push_back(thread_handle); + } + } + + return device_thread_handles; +} + vector odbGetParallelRegions(OMPDFunctionsPtr functions, ompd_thread_handle_t* &th) { ompd_rc_t ret; ompd_parallel_handle_t * parallel_handle; vector parallel_handles; - ret = functions->ompd_get_top_parallel_region( - th, ¶llel_handle); + ret = functions->ompd_get_current_parallel_handle( + th, ¶llel_handle); while(ret == ompd_rc_ok) { parallel_handles.push_back(parallel_handle); ret = functions->ompd_get_enclosing_parallel_handle( - parallel_handle, ¶llel_handle); + parallel_handle, ¶llel_handle); } return parallel_handles; } bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { + sout << "Checking of parallel IDs has been disabled for upgrade of ompd in branch ompd-devices\n"; + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -466,10 +621,14 @@ bool odbCheckParallelIDs(OMPDFunctionsPtr functions, vector phs) { + sout << "Checking of parallel IDs has been disable for upgrade of ompd in branch ompd-devices\n"; + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -486,10 +645,14 @@ bool odbCheckParallelNumThreads(OMPDFunctionsPtr functions, vector ths) { + sout << "Checking of task IDs has been disable for upgrade of ompd in branch ompd-devices\n"; + return true; +#if 0 bool res=true; // ompd_rc_t ret; int i=0; @@ -506,20 +669,21 @@ bool odbCheckTaskIDs(OMPDFunctionsPtr functions, vector ths if (ompt_res != ompd_res) res=false; } return res; +#endif } vector odbGetTaskRegions(OMPDFunctionsPtr 
functions, ompd_thread_handle_t* th) { ompd_rc_t ret; - ompd_task_handle_t * task_handle; + ompd_task_handle_t *task_handle; vector task_handles; - ret = functions->ompd_get_top_task_region( - th, &task_handle); + ret = functions->ompd_get_current_task_handle( + th, &task_handle); while(ret == ompd_rc_ok) { task_handles.push_back(task_handle); - ret = functions->ompd_get_ancestor_task_region( - task_handle, &task_handle); + ret = functions->ompd_get_generating_task_handle( + task_handle, &task_handle); // Is it generating or scheduling task or something different? } return task_handles; } @@ -527,28 +691,76 @@ vector odbGetTaskRegions(OMPDFunctionsPtr functions, ompd_t vector odbGetImplicitTasks(OMPDFunctionsPtr functions, ompd_parallel_handle_t* ph) { // ompd_rc_t ret; - ompd_task_handle_t** task_handles; - int num_tasks; + int num_tasks = evalGdbExpression("call omp_get_num_threads()"); vector return_handles; - /*ret = */functions->ompd_get_implicit_task_in_parallel( - ph, &task_handles, &num_tasks); - for(int i=0; iompd_get_task_in_parallel( + ph, i, &task_handle); + return_handles.push_back(task_handle); } - free(task_handles); return return_handles; } +static bool odbCheckThreadsInParallel(OMPDFunctionsPtr functions, + OMPDIcvsPtr icvs, + ompd_parallel_handle_t *ph, + vector thread_handles) { + ompd_rc_t ret; + bool check_passed = true; + int64_t icv_num_threads; + int64_t icv_level; + + icvs->get(ph, "levels-var", &icv_level); + + ret = icvs->get(ph, "ompd-team-size-var", &icv_num_threads); + if (ret != ompd_rc_ok) { + cout << "Error: could not retrieve icv 'ompd-team-size-var' (" << ret << ")" << endl; + return false; + } + + OMPDThreadHandleCmp thread_cmp_op(functions); + std::set unique_thread_handles(thread_handles.begin(), + thread_handles.end(), + thread_cmp_op); + + sout << "Checking parallel region with level " << icv_level << " and " + << icv_num_threads << " threads (overall " << unique_thread_handles.size() + << " associated threads)" << endl; + + 
ompd_thread_handle_t *th; + for(int i = 0; i < icv_num_threads; i++) { + ret = functions->ompd_get_thread_in_parallel(ph, i, &th); + if (ret != ompd_rc_ok) { + cout << "Could not retrieve thread handle " << i << " in parallel (" << ret << ")" << endl; + check_passed = false; + continue; + } + + auto matched_th = unique_thread_handles.find(th); + if (matched_th == unique_thread_handles.end()) { + cout << "Thread handle retrieved with ompd_get_thread_in_parallel doesn't match any thread associated with the parallel region (could already have been matched)" << endl; + check_passed = false; + } else { + sout << "Found matching thread for thread " << i << " in parallel region" << endl; + // we dont want a thread matched twice + unique_thread_handles.erase(matched_th); + } + functions->ompd_release_thread_handle(th); + } + return check_passed; +} + void OMPDTest::execute() const -{ +{ // ompd_rc_t ret; if (extraArgs.empty() || extraArgs[0] == "help") { hout << "Test suites available: threads, parallel, tasks" << endl; return; - } + } if (extraArgs[0] == "threads") { @@ -564,7 +776,7 @@ void OMPDTest::execute() const { auto parallel_h = odbGetParallelRegions(functions, thr_h); auto task_h = odbGetTaskRegions(functions, thr_h); - + sout << "Thread handle: 0x" << hex << thr_h << endl << "Parallel: "; for(auto ph: parallel_h) { @@ -573,10 +785,12 @@ void OMPDTest::execute() const auto implicit_task_h = odbGetImplicitTasks(functions, ph); for(auto ith: implicit_task_h) { +#if 0 //MARKER_MR: TODO: fix this uint64_t tid; functions->ompd_get_task_id( ith, &tid); - sout << "0x" << hex << ith << " (" << tid << "), "; +#endif + sout << "0x" << hex << ith << " (" << "DISABLED IN ompd-devices" << "), "; functions->ompd_release_task_handle(ith); } sout << endl; @@ -587,7 +801,7 @@ void OMPDTest::execute() const } sout << endl; pthread_t osthread; - functions->ompd_get_osthread(thr_h, ompd_osthread_pthread, sizeof(pthread_t), &osthread); + functions->ompd_get_thread_id(thr_h, 
OMPD_THREAD_ID_PTHREAD, sizeof(pthread_t), &osthread); host_contextPool->getThreadContext(&osthread)->setThisGdbContext(); odbCheckParallelIDs(functions, parallel_h); odbCheckTaskIDs(functions, task_h); @@ -598,11 +812,271 @@ void OMPDTest::execute() const functions->ompd_release_thread_handle(thr_h); } } + else if (extraArgs[0] == "parallel-threads") + { + // Checks if the thread handles returned by ompd_get_thread_in_parallel make sense + if (extraArgs.size() > 1) { + hout << "Usage: odb testapi parallel-threads" << endl; + return; + } + + // Check host parallel regions + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + + OMPDParallelHandleCmp parallel_cmp_op(functions); + std::map, + OMPDParallelHandleCmp> host_parallel_handles(parallel_cmp_op); + for (auto t: host_thread_handles) { + for (auto parallel_handle: odbGetParallelRegions(functions, t)) + { + host_parallel_handles[parallel_handle].push_back(t); + } + } + + bool host_check_passed = true; + for (auto &ph_threads: host_parallel_handles) { + if (!odbCheckThreadsInParallel(functions, icvs, ph_threads.first, ph_threads.second)) { + host_check_passed = false; + } + } + + cout << "Host check passed: " << host_check_passed << "\n" << endl; + + for (auto ph: host_parallel_handles) { + functions->ompd_release_parallel_handle(ph.first); + } + + for (auto th: host_thread_handles) { + functions->ompd_release_thread_handle(th); + } + // + // For Cuda devices + // + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDParallelHandleCmp> cuda_parallel_handles(parallel_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto p: odbGetParallelRegions(functions, t)) { + cuda_parallel_handles[p].push_back(t); + } + } + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + auto cudaIcvs = 
OMPDIcvsPtr(new OMPDIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle)); + + bool cuda_check_passed = true; + for (auto ph_threads: cuda_parallel_handles) { + if (!odbCheckThreadsInParallel(functions, cudaIcvs, ph_threads.first, ph_threads.second)) { + cuda_check_passed = false; + } + } + cout << "Cuda check passed: " << cuda_check_passed << endl; + return; + } } const char* OMPDTest::toString() const { return "odb api"; } + +void OMPDParallelRegions::execute() const +{ + ompd_rc_t ret; + + // + // For the host runtime + // + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + + OMPDParallelHandleCmp parallel_cmp_op(functions); + std::map, + OMPDParallelHandleCmp> host_parallel_handles(parallel_cmp_op); + for (auto t: host_thread_handles) { + for (auto parallel_handle: odbGetParallelRegions(functions, t)) + { + host_parallel_handles[parallel_handle].push_back(t); + } + } + + printf("HOST PARALLEL REGIONS\n"); + printf("Parallel Handle Num Threads ICV Num Threads ICV level ICV active level\n"); + printf("------------------------------------------------------------------------------\n"); + for (auto &p: host_parallel_handles) { + ompd_word_t icv_num_threads, icv_level, icv_active_level; + icvs->get(p.first, "ompd-team-size-var", &icv_num_threads); + icvs->get(p.first, "levels-var", &icv_level); + icvs->get(p.first, "active-levels-var", &icv_active_level); + printf("%-15p %-10zu %-15ld %-9ld %ld\n", p.first, p.second.size(), icv_num_threads, icv_level, icv_active_level); + } + + for (auto t: host_thread_handles) { + functions->ompd_release_thread_handle(t); + } + for (auto &p: host_parallel_handles) { + functions->ompd_release_parallel_handle(p.first); + } + + // + // For Cuda devices + // + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDParallelHandleCmp> 
cuda_parallel_handles(parallel_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto p: odbGetParallelRegions(functions, t)) { + cuda_parallel_handles[p].push_back(t); + } + } + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); + + printf("DEVICE PARALLEL REGIONS\n"); + printf("Parallel Handle Num Threads ICV Num Threads ICV level\n"); + printf("------------------------------------------------------------\n"); + for (auto &p: cuda_parallel_handles) { + ompd_word_t icv_level, icv_num_threads; + cudaIcvs.get(p.first, "ompd-team-size-var", &icv_num_threads); + cudaIcvs.get(p.first, "levels-var", &icv_level); + printf("%-15p %-10zu %-14ld %ld\n", p.first, p.second.size(), icv_num_threads, icv_level); + } + + for (auto t: cuda_thread_handles) { + functions->ompd_release_thread_handle(t); + } + for (auto &p: cuda_parallel_handles) { + functions->ompd_release_parallel_handle(p.first); + } + for (auto &d: cuda_device_handles) { + functions->ompd_release_address_space_handle(d.second.ompd_device_handle); + } +} + +const char *OMPDParallelRegions::toString() const +{ + return "odb parallel"; +} + +void OMPDTasks::execute() const +{ + ompd_rc_t ret; + auto host_thread_handles = odbGetThreadHandles(addrhandle, functions); + OMPDTaskHandleCmp task_cmp_op(functions); + std::map, + OMPDTaskHandleCmp> host_task_handles(task_cmp_op); + for (auto t: host_thread_handles) { + for (auto task_handle: odbGetTaskRegions(functions, t)) { + host_task_handles[task_handle].push_back(t); + } + } + + printf("HOST TASKS\n"); + printf("Task Handle Assoc. 
Threads ICV Level Enter Frame Exit Frame Task function\n"); + printf("-----------------------------------------------------------------------------------\n"); + for (auto th: host_task_handles) { + ompd_parallel_handle_t *ph; + ret = functions->ompd_get_task_parallel_handle(th.first, &ph); + if (ret != ompd_rc_ok) { + printf("could not get parallel handle for nesting\n"); + continue; + } + + ompd_word_t icv_level; + icvs->get(ph, "levels-var", &icv_level); + + ompd_address_t enter_frame; + ompd_address_t exit_frame; + ret = functions->ompd_get_task_frame(th.first, &enter_frame, &exit_frame); + if (ret != ompd_rc_ok) { + printf("could not get task frame\n"); + continue; + } + + ompd_address_t task_function; + ret = functions->ompd_get_task_function(th.first, &task_function); + if (ret != ompd_rc_ok) { + printf("could not get task entry point\n"); + } + printf("%-11p %-14zu %-9ld %-11p %-10p %p\n", th.first, + th.second.size(), icv_level, (void*)enter_frame.address, + (void*)exit_frame.address, (void*)task_function.address); + } + + for (auto task: host_task_handles) { + functions->ompd_release_task_handle(task.first); + } + + for (auto thread: host_thread_handles) { + functions->ompd_release_thread_handle(thread); + } + + // Cuda tasks + CudaGdb cuda; + auto cuda_device_handles = odbInitCudaDevices(functions, cuda, addrhandle); + auto cuda_thread_handles = odbGetCudaThreadHandles(functions, cuda, cuda_device_handles); + std::map, + OMPDTaskHandleCmp> cuda_task_handles(task_cmp_op); + for (auto t: cuda_thread_handles) { + for (auto task_handle: odbGetTaskRegions(functions, t)) { + cuda_task_handles[task_handle].push_back(t); + } + } + + printf("\nCUDA TASKS\n"); + printf("Task Handle Assoc. 
Threads ICV Level task function\n"); + printf("--------------------------------------------------------\n"); + + // For instantiation, it doesnt matter which device handle we use for + // OMPDIcvs, just use the first one + + OMPDIcvs cudaIcvs(functions, cuda_device_handles.begin()->second.ompd_device_handle); + + for (auto th: cuda_task_handles) { + ompd_parallel_handle_t *ph; + ret = functions->ompd_get_task_parallel_handle(th.first, &ph); + if (ret != ompd_rc_ok) { + printf("could not get parallel handle for nesting\n"); + continue; + } + + ompd_word_t icv_level; + cudaIcvs.get(ph, "levels-var", &icv_level); + + ompd_address_t task_func_addr; + task_func_addr.address = 0; + functions->ompd_get_task_function(th.first, &task_func_addr); + + printf("%-11p %-14zu %-8ld %p\n", th.first, th.second.size(), icv_level, (void*)task_func_addr.address); + functions->ompd_release_parallel_handle(ph); + } + + for (auto task: cuda_task_handles) { + functions->ompd_release_task_handle(task.first); + } + + for (auto thread: cuda_thread_handles) { + functions->ompd_release_thread_handle(thread); + } +} + +const char *OMPDTasks::toString() const +{ + return "odb tasks"; +} diff --git a/libompd/gdb-wrapper/OMPDCommand.h b/libompd/gdb-wrapper/OMPDCommand.h index 04e8bf912..756658a69 100644 --- a/libompd/gdb-wrapper/OMPDCommand.h +++ b/libompd/gdb-wrapper/OMPDCommand.h @@ -28,28 +28,10 @@ #include #include #include +#include #include "ompd.h" #include "ompd_typedefs.h" -#include "ompd_test.h" - - -/* - * The macro is used to create code to register all implemented ompd - * API functions with the CommandFactory - * For new implemented API function just add a new OMPD_DO line - */ - - -#define FOREACH_OMPD_CALLBACK_FN(macro) \ -macro(ompd_dmemory_alloc) \ -macro(ompd_dmemory_free) \ -macro(ompd_tsizeof_prim) \ -macro(ompd_tsymbol_addr) \ -macro(ompd_ttype) \ -macro(ompd_ttype_sizeof) \ -macro(ompd_ttype_offset) \ -macro(ompd_tmemory_access) \ -macro(ompd_print_string) +//#include 
"ompd_test.h" #define FOREACH_OMPD_API_FN(macro) \ macro(ompd_process_initialize) \ @@ -57,43 +39,28 @@ macro(ompd_device_initialize) \ macro(ompd_release_address_space_handle) \ macro(ompd_initialize) \ macro(ompd_finalize) \ -macro(ompd_get_threads) \ macro(ompd_get_thread_in_parallel) \ macro(ompd_release_thread_handle) \ macro(ompd_thread_handle_compare) \ -macro(ompd_get_top_parallel_region) \ +macro(ompd_get_thread_id) \ +macro(ompd_get_current_parallel_handle) \ macro(ompd_get_enclosing_parallel_handle) \ -macro(ompd_get_task_enclosing_parallel_handle) \ +macro(ompd_get_task_parallel_handle) \ macro(ompd_release_parallel_handle) \ macro(ompd_parallel_handle_compare) \ -macro(ompd_get_top_task_region) \ -macro(ompd_get_ancestor_task_region) \ -macro(ompd_get_implicit_task_in_parallel) \ +macro(ompd_get_current_task_handle) \ +macro(ompd_get_generating_task_handle) \ +macro(ompd_get_task_in_parallel) \ macro(ompd_release_task_handle) \ macro(ompd_task_handle_compare) \ -macro(ompd_get_num_procs) \ -macro(ompd_get_thread_limit) \ -macro(ompd_get_num_threads) \ -macro(ompd_get_level) \ -macro(ompd_get_active_level) \ -macro(ompd_get_parallel_id) \ -macro(ompd_get_parallel_function) \ macro(ompd_get_thread_handle) \ -macro(ompd_get_osthread) \ +macro(ompd_enumerate_states) \ macro(ompd_get_state) \ -macro(ompd_get_max_threads) \ -macro(ompd_get_thread_num) \ -macro(ompd_in_parallel) \ -macro(ompd_in_final) \ -macro(ompd_get_dynamic) \ -macro(ompd_get_nested) \ -macro(ompd_get_max_active_levels) \ -macro(ompd_get_schedule) \ -macro(ompd_get_proc_bind) \ +macro(ompd_get_task_function) \ macro(ompd_get_task_frame) \ -macro(ompd_get_task_id) \ -macro(ompd_get_version) \ -macro(ompd_get_version_string) \ +macro(ompd_get_api_version) \ +macro(ompd_enumerate_icvs) \ +macro(ompd_get_icv_from_scope) \ namespace ompd_gdb { @@ -119,23 +86,72 @@ typedef struct FOREACH_OMPD_API_FN(OMPD_API_FUNCTION_POINTER_MEMBER) #undef OMPD_API_FUNCTION_POINTER_MEMBER -/* ompd_rc_t 
(*ompd_initialize) (ompd_callbacks_t *) = nullptr; - ompd_get_thread_handle_fn_t ompd_get_thread_handle = nullptr; - ompd_nesting_level_fn_t ompd_nesting_level = nullptr; - ompd_read_tmemory_fn_t ompd_read_tmemory = nullptr; -*/ - } OMPDFunctions; typedef std::shared_ptr OMPDFunctionsPtr; +class OMPDIcvs +{ +private: + OMPDFunctionsPtr functions; + std::map> availableIcvs; +public: + OMPDIcvs(OMPDFunctionsPtr functions, + ompd_address_space_handle_t *addrhandle); + ompd_rc_t get(ompd_parallel_handle_t *handle, const char *name, + ompd_word_t *value); +}; + +typedef std::shared_ptr OMPDIcvsPtr; + +class OMPDParallelHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDParallelHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_parallel_handle_t *a, ompd_parallel_handle_t *b) { + int cmp = 0; + functions->ompd_parallel_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + +class OMPDThreadHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDThreadHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_thread_handle_t *a, ompd_thread_handle_t *b) { + int cmp = 0; + functions->ompd_thread_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + +class OMPDTaskHandleCmp +{ + OMPDFunctionsPtr functions; +public: + OMPDTaskHandleCmp(const OMPDFunctionsPtr &f) + : functions(f) {} + bool operator()(ompd_task_handle_t *a, ompd_task_handle_t *b) { + int cmp = 0; + functions->ompd_task_handle_compare(a, b, &cmp); + return cmp < 0; + } +}; + class OMPDCommand; class OMPDCommandFactory { private: void * findFunctionInLibrary(const char *fun) const; + void initOmpd(); OMPDFunctionsPtr functions = nullptr; + OMPDIcvsPtr icvs = nullptr; // ompd_process_handle_t* prochandle = nullptr; ompd_address_space_handle_t* addrhandle = nullptr; OutputString out; @@ -144,7 +160,7 @@ class OMPDCommandFactory OMPDCommandFactory(); ~OMPDCommandFactory(); // OMPDCommand* create(const char *str) const; - OMPDCommand* create(const char 
*str, const std::vector& extraArgs=std::vector()) const; + OMPDCommand* create(const char *str, const std::vector& extraArgs=std::vector()); }; typedef std::unique_ptr OMPDCommandFactoryPtr; @@ -227,12 +243,14 @@ class OMPDThreads : public OMPDCommand class OMPDLevels : public OMPDCommand { + OMPDIcvsPtr icvs; public: ~OMPDLevels(){}; void execute() const; const char* toString() const; protected: - OMPDLevels(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const std::vector& args) : OMPDCommand(f, ah, args){}; + OMPDLevels(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const OMPDIcvsPtr &icvs, const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; friend OMPDCommandFactory; }; @@ -268,9 +286,46 @@ class OMPDTest : public OMPDCommand void execute() const; const char* toString() const; protected: - OMPDTest(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, const std::vector& args) : OMPDCommand(f, ah, args){}; + OMPDTest(const OMPDFunctionsPtr &f, ompd_address_space_handle_t* ah, + const OMPDIcvsPtr &icvs, const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {}; friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; +}; + +class OMPDParallelRegions : public OMPDCommand +{ +public: + ~OMPDParallelRegions() {}; + void execute() const; + const char *toString() const; +protected: + OMPDParallelRegions(const OMPDFunctionsPtr &f, + ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, + const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {} + + friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; +}; + +class OMPDTasks : public OMPDCommand +{ +public: + ~OMPDTasks() {} + void execute() const; + const char *toString() const; +protected: + OMPDTasks(const OMPDFunctionsPtr &f, + ompd_address_space_handle_t *ah, const OMPDIcvsPtr &icvs, + const std::vector& args) + : OMPDCommand(f, ah, args), icvs(icvs) {} + friend OMPDCommandFactory; +private: + OMPDIcvsPtr icvs; }; } diff --git 
a/libompd/gdb-wrapper/OMPDContext.cpp b/libompd/gdb-wrapper/OMPDContext.cpp index 9b92e0d13..b344a1de4 100644 --- a/libompd/gdb-wrapper/OMPDContext.cpp +++ b/libompd/gdb-wrapper/OMPDContext.cpp @@ -139,7 +139,24 @@ ompd_thread_context_t * OMPDHostContext::getContextForThread(gdb_thread_id& thr_ bool OMPDCudaContext::setThisGdbContext() { - bool ret = false; + bool ret = true; + stringstream device_command; + stringstream coord_command; + device_command << "cuda device " << this->cudathread->coord.cudaDevId; + coord_command << "cuda grid " << this->cudathread->coord.gridId + << " block " << this->cudathread->coord.blockIdx.x + << " thread " << this->cudathread->coord.threadIdx.x; + OMPDContextPool::gdb->writeInput(device_command.str().c_str()); + string gdbOut = OMPDContextPool::gdb->readOutput(); + if (gdbOut.find("cannot be satisfied") != 0) + ret = false; + + OMPDContextPool::gdb->writeInput(coord_command.str().c_str()); + gdbOut = OMPDContextPool::gdb->readOutput(); + if (gdbOut.find("cannot be satisfied") != 0) + ret = false; + +#if 0 stringstream command; command #ifdef HACK_FOR_CUDA_GDB @@ -154,6 +171,7 @@ bool OMPDCudaContext::setThisGdbContext() string gdbOut = OMPDContextPool::gdb->readOutput(); if (gdbOut.find("not known")==0) ret = true; +#endif return ret; } diff --git a/libompd/gdb-wrapper/OMPDContext.h b/libompd/gdb-wrapper/OMPDContext.h index be3142439..89793543e 100644 --- a/libompd/gdb-wrapper/OMPDContext.h +++ b/libompd/gdb-wrapper/OMPDContext.h @@ -16,7 +16,7 @@ */ #include "ompd.h" -#include "ompd_test.h" +//#include "ompd_test.h" #include "GdbProcess.h" #include "Callbacks.h" #include "CudaGdb.h" diff --git a/libompd/gdb-wrapper/StringParser.cpp b/libompd/gdb-wrapper/StringParser.cpp index 0df120459..e3ebf3f93 100644 --- a/libompd/gdb-wrapper/StringParser.cpp +++ b/libompd/gdb-wrapper/StringParser.cpp @@ -224,7 +224,7 @@ vector StringParser::matchCudaThreadsInfo( coord.gridId = grid; coord.cudaContext = ctx; coord.cudaDevId = dev; - 
coord.kernelId = kernel; + coord.warpSize = 0; for (int b = 0; b < threadcounts.size(); ++b) { coord.blockIdx.x = b; diff --git a/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake b/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake new file mode 100644 index 000000000..14613ae4f --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindCudaGDB.cmake @@ -0,0 +1,60 @@ +# - Try to find GDB +# +# Once done, this will define: +# CUDA_GDB_FOUND - system has CUDA_GDB +# CUDA_GDB_COMMAND - the command to run +# CUDA_GDB_VERSION - version +# CUDA_GDB_HAS_RETURN_CHILD_RESULT - if the --return-child-result flag is supported +# +# Useful configuration variables you might want to add to your cache: +# CUDA_GDB_ROOT_DIR - A directory prefix to search +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +set(CUDA_GDB_ROOT_DIR + "${CUDA_GDB_ROOT_DIR}" + CACHE + PATH + "Directory to start our search in") + +find_program(CUDA_GDB_COMMAND + NAMES + cuda-gdb + HINTS + "${CUDA_GDB_ROOT_DIR}" + PATH_SUFFIXES + bin + libexec) + +if(CUDA_GDB_COMMAND) + execute_process(COMMAND cuda-gdb --version + COMMAND head -n 1 + OUTPUT_VARIABLE CUDA_GDB_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "[^0-9]*([0-9]+[0-9.]*).*" "\\1" CUDA_GDB_VERSION "${CUDA_GDB_VERSION}") +endif() + +# handle the QUIETLY and REQUIRED arguments and set xxx_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUDA_GDB DEFAULT_MSG CUDA_GDB_COMMAND CUDA_GDB_VERSION) + +if(CUDA_GDB_FOUND) + mark_as_advanced(CUDA_GDB_ROOT_DIR) + if(CUDA_GDB_VERSION VERSION_LESS 6.4) + set(CUDA_GDB_HAS_RETURN_CHILD_RESULT FALSE) + else() + 
set(CUDA_GDB_HAS_RETURN_CHILD_RESULT TRUE) + endif() +endif() + +mark_as_advanced(CUDA_GDB_COMMAND) diff --git a/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake b/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake new file mode 100644 index 000000000..a5f743da6 --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindGDB.cmake @@ -0,0 +1,60 @@ +# - Try to find GDB +# +# Once done, this will define: +# GDB_FOUND - system has GDB +# GDB_COMMAND - the command to run +# GDB_VERSION - version +# GDB_HAS_RETURN_CHILD_RESULT - if the --return-child-result flag is supported +# +# Useful configuration variables you might want to add to your cache: +# GDB_ROOT_DIR - A directory prefix to search +# +# Original Author: +# 2009-2010 Ryan Pavlik +# http://academic.cleardefinition.com +# Iowa State University HCI Graduate Program/VRAC +# +# Copyright Iowa State University 2009-2010. +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + + +set(GDB_ROOT_DIR + "${GDB_ROOT_DIR}" + CACHE + PATH + "Directory to start our search in") + +find_program(GDB_COMMAND + NAMES + gdb + HINTS + "${GDB_ROOT_DIR}" + PATH_SUFFIXES + bin + libexec) + +if(GDB_COMMAND) + execute_process(COMMAND gdb --version + COMMAND head -n 1 + OUTPUT_VARIABLE GDB_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "[^0-9]*([0-9]+[0-9.]*).*" "\\1" GDB_VERSION "${GDB_VERSION}") +endif() + +# handle the QUIETLY and REQUIRED arguments and set xxx_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GDB DEFAULT_MSG GDB_COMMAND GDB_VERSION) + +if(GDB_FOUND) + mark_as_advanced(GDB_ROOT_DIR) + if(GDB_VERSION VERSION_LESS 6.4) + set(GDB_HAS_RETURN_CHILD_RESULT FALSE) + else() + set(GDB_HAS_RETURN_CHILD_RESULT TRUE) + endif() +endif() + +mark_as_advanced(GDB_COMMAND) diff --git a/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake 
b/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake new file mode 100644 index 000000000..745cfe583 --- /dev/null +++ b/libompd/gdb-wrapper/cmake/Modules/FindReadline.cmake @@ -0,0 +1,47 @@ +# - Try to find readline include dirs and libraries +# +# Usage of this module as follows: +# +# find_package(Readline) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Readline_ROOT_DIR Set this variable to the root installation of +# readline if the module has problems finding the +# proper installation path. +# +# Variables defined by this module: +# +# READLINE_FOUND System has readline, include and lib dirs found +# Readline_INCLUDE_DIR The readline include directories. +# Readline_LIBRARY The readline library. + +find_path(Readline_ROOT_DIR + NAMES include/readline/readline.h +) + +find_path(Readline_INCLUDE_DIR + NAMES readline/readline.h + HINTS ${Readline_ROOT_DIR}/include +) + +find_library(Readline_LIBRARY + NAMES readline + HINTS ${Readline_ROOT_DIR}/lib +) + +if(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + set(READLINE_FOUND TRUE) +else(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + FIND_LIBRARY(Readline_LIBRARY NAMES readline) + include(FindPackageHandleStandardArgs) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(Readline DEFAULT_MSG Readline_INCLUDE_DIR Readline_LIBRARY ) + MARK_AS_ADVANCED(Readline_INCLUDE_DIR Readline_LIBRARY) +endif(Readline_INCLUDE_DIR AND Readline_LIBRARY AND Ncurses_LIBRARY) + +mark_as_advanced( + Readline_ROOT_DIR + Readline_INCLUDE_DIR + Readline_LIBRARY +) diff --git a/libompd/gdb-wrapper/ompd_typedefs.h b/libompd/gdb-wrapper/ompd_typedefs.h new file mode 100644 index 000000000..825916434 --- /dev/null +++ b/libompd/gdb-wrapper/ompd_typedefs.h @@ -0,0 +1,200 @@ +#include "ompd.h" + + +/* 4.3.4.1 + * Global initialization and finalization + */ + + +typedef ompd_rc_t (*ompd_initialize_fn_t) ( + ompd_word_t api_version, + const 
ompd_callbacks_t *callbacks +); + +typedef ompd_rc_t (*ompd_get_api_version_fn_t) ( + ompd_word_t *version +); + +typedef ompd_rc_t (*ompd_get_version_string_fn_t) ( + const char **string +); + +typedef ompd_rc_t (*ompd_finalize_fn_t) (void); + +/* 4.3.4.2 + * Per OpenMP Process Initialiyation and Finalization + */ + +typedef ompd_rc_t (*ompd_process_initialize_fn_t) ( + ompd_address_space_context_t *context, + ompd_address_space_handle_t **handle + ); + +typedef ompd_rc_t (*ompd_device_initialize_fn_t) ( + ompd_address_space_handle_t *process_handle, /*IN: address space of the OpenMP process*/ + ompd_address_space_context_t *device_context, /*IN: Opaque tool handle for device address space*/ + ompd_device_t kind, /*IN: device identifier kind*/ + ompd_size_t sizeof_id, /*IN: size of device identifier*/ + void *id, /*IN: device identifier*/ + ompd_address_space_handle_t **device_handle /*OUT: device handle*/ + ); + + +typedef ompd_rc_t (*ompd_release_address_space_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ + ); + +/* 4.3.4.4 + * Address space information + */ + +typedef ompd_rc_t (*ompd_get_omp_version_fn_t) ( + ompd_address_space_handle_t *address_space, + ompd_word_t *omp_version + ); + +typedef ompd_rc_t (*ompd_get_omp_version_string_fn_t) ( + ompd_address_space_handle_t *address_space, + const char **string + ); + +/* 4.3.4.5 + * Thread Handles + */ + +typedef ompd_rc_t (*ompd_get_thread_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /*IN: handle for the parallel region*/ + int thread_num, /*IN: the nubmer of the thread that is returned*/ + ompd_thread_handle_t **thread_hanlde /*OUT: returned thread handle*/ + ); + + +typedef ompd_rc_t (*ompd_get_thread_handle_fn_t) ( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_osthread, + const void* osthread, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread 
handle*/ + ); + +typedef ompd_rc_t (*ompd_release_thread_handle_fn_t) ( + ompd_thread_handle_t *thread_handle +); + +typedef ompd_rc_t (*ompd_thread_handle_compare_fn_t) ( + ompd_thread_handle_t *thread_handle_1, + ompd_thread_handle_t *thread_handle_2, + int *cmp_value +); + +typedef ompd_rc_t (*ompd_get_thread_id_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, + void *thread_id + ); + +/* 4.3.4.6 + * Parallel Region Handles + */ + +typedef ompd_rc_t (*ompd_get_current_parallel_handle_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_parallel_handle_t **parallel_handle + ); + +typedef ompd_rc_t (*ompd_get_enclosing_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_parallel_handle_t **enclosing_parallel_handle /* OUT: OpenMP parallel handle */ + ); + +typedef ompd_rc_t (*ompd_get_task_parallel_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_parallel_handle_t **task_parallel_handle + ); + +typedef ompd_rc_t (*ompd_release_parallel_handle_fn_t) ( + ompd_parallel_handle_t *parallel_handle + ); + +typedef ompd_rc_t (*ompd_parallel_handle_compare_fn_t) ( + ompd_parallel_handle_t *parallel_handle_1, + ompd_parallel_handle_t *parallel_handle_2, + int *cmp_value + ); + +/* 4.3.4.7 + * Task Handles + */ + +typedef ompd_rc_t (*ompd_get_current_task_handle_fn_t) ( + ompd_thread_handle_t *thread_handle, + ompd_task_handle_t **task_handle + ); + +typedef ompd_rc_t (*ompd_get_generating_task_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_task_handle_t **generating_task_handle + ); + +typedef ompd_rc_t (*ompd_get_scheduling_task_handle_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_task_handle_t **scheduling_task_handle + ); + +typedef ompd_rc_t (*ompd_get_task_in_parallel_fn_t) ( + ompd_parallel_handle_t *parallel_handle, + int thread_num, + ompd_task_handle_t **task_handle + ); + +typedef ompd_rc_t (*ompd_release_task_handle_fn_t) ( + 
ompd_task_handle_t *task_handle +); + +typedef ompd_rc_t (*ompd_task_handle_compare_fn_t) ( + ompd_task_handle_t *task_handle_1, + ompd_task_handle_t *task_handle_2, + int *cmp_value +); + +typedef ompd_rc_t (*ompd_get_task_function_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_address_t *entry_point + ); + +typedef ompd_rc_t (*ompd_get_task_frame_fn_t) ( + ompd_task_handle_t *task_handle, + ompd_address_t *exit_frame, + ompd_address_t *enter_frame + ); + +typedef ompd_rc_t (*ompd_enumerate_states_fn_t) ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + +typedef ompd_rc_t (*ompd_get_state_fn_t) ( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + +typedef ompd_rc_t (*ompd_enumerate_icvs_fn_t) ( + ompd_address_space_handle_t *handle, + ompd_icv_id_t current, + ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more + ); + +typedef ompd_rc_t (*ompd_get_icv_from_scope_fn_t) ( + void *handle, + ompd_scope_t scope, + ompd_icv_id_t icv_id, + ompd_word_t *icv_value + ); diff --git a/libompd/src/CMakeLists.txt b/libompd/src/CMakeLists.txt index 0fb4e6b0f..5ffc44035 100644 --- a/libompd/src/CMakeLists.txt +++ b/libompd/src/CMakeLists.txt @@ -1,6 +1,6 @@ project (libompd) -add_library (ompd SHARED TargetValue.cpp omp-debug.cpp) +add_library (ompd SHARED TargetValue.cpp omp-debug.cpp omp-state.cpp omp-icv.cpp) add_dependencies(ompd omp) # ensure generated import library is created first diff --git a/libompd/src/TargetValue.cpp b/libompd/src/TargetValue.cpp index 43a394f83..d1de522fd 100644 --- a/libompd/src/TargetValue.cpp +++ b/libompd/src/TargetValue.cpp @@ -6,9 +6,14 @@ #include const ompd_callbacks_t *TValue::callbacks = NULL; -ompd_target_type_sizes_t TValue::type_sizes; 
+ompd_device_type_sizes_t TValue::type_sizes; +// MARKER_MR: This is just compat stuff because I dont have time to +// replace this function. TODO: replace this function inline int ompd_sizeof(ompd_target_prim_types_t t) { + assert(t != ompd_type_max && "ompd_type_max should not be used anywhere"); + assert(t != ompd_type_invalid && "request size of invalid type"); + return (((char *)&TValue::type_sizes)[(int)t]); } @@ -44,7 +49,14 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { ompd_size_t tmpSize; std::stringstream ss; ss << "ompd_sizeof__" << typeName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -52,15 +64,25 @@ ompd_rc_t TType::getSize(ompd_size_t *size) { << ") \\" << std::endl; return ret; } + symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are alwazs in + // shared memory. 
+ // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpSize)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host( - context, &tmpSize, ompd_sizeof(ompd_type_long_long), 1, &(typeSize)); + ret = TValue::callbacks->device_to_host( + context, &tmpSize, TValue::type_sizes.sizeof_long_long, 1, &(typeSize)); } *size = typeSize; return ret; @@ -77,7 +99,7 @@ ompd_rc_t TType::getBitfieldMask(const char *fieldName, // &fieldOffset); std::stringstream ss; ss << "ompd_bitfield__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -87,14 +109,14 @@ ompd_rc_t TType::getBitfieldMask(const char *fieldName, } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpMask)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host(context, &(tmpMask), - ompd_sizeof(ompd_type_long_long), 1, - &(bitfieldMask)); + ret = TValue::callbacks->device_to_host(context, &(tmpMask), + TValue::type_sizes.sizeof_long_long, + 1, &(bitfieldMask)); if (ret != ompd_rc_ok) { return ret; } @@ -114,7 +136,14 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { // &fieldOffset); std::stringstream ss; ss << "ompd_access__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + + // 
HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -124,14 +153,23 @@ ompd_rc_t TType::getElementOffset(const char *fieldName, ompd_size_t *offset) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are alwazs in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host(context, &(tmpOffset), - ompd_sizeof(ompd_type_long_long), 1, - &fieldOffset); + ret = TValue::callbacks->device_to_host(context, &(tmpOffset), + TValue::type_sizes.sizeof_long_long, + 1, &fieldOffset); if (ret != ompd_rc_ok) { return ret; } @@ -151,7 +189,14 @@ ompd_rc_t TType::getElementSize(const char *fieldName, ompd_size_t *size) { // &fieldOffset); std::stringstream ss; ss << "ompd_sizeof__" << typeName << "__" << fieldName; - ret = TValue::callbacks->tsymbol_addr(context, NULL, ss.str().c_str(), + + // HACK FOR NAME MANGLING ISSUE IN CUDA-GDB (mr) + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL || + descSegment == OMPD_SEGMENT_CUDA_PTX_SHARED) { + ss << "_"; + } + + ret = TValue::callbacks->symbol_addr_lookup(context, NULL, ss.str().c_str(), &symbolAddr); if (ret != ompd_rc_ok) { dout << "missing symbol " << ss.str() @@ -161,13 +206,23 @@ ompd_rc_t TType::getElementSize(const char 
*fieldName, ompd_size_t *size) { } symbolAddr.segment = descSegment; - ret = TValue::callbacks->read_tmemory(context, NULL, symbolAddr, - 1 * ompd_sizeof(ompd_type_long_long), + // On cuda targets, ompd_sizeof_ and ompd_access_ symbols are alwazs in + // shared memory. + // This is a hack to ensure that we are not looking in global memory for + // it + // TODO (mr): Find a better solution + if (descSegment == OMPD_SEGMENT_CUDA_PTX_GLOBAL) { + symbolAddr.segment = OMPD_SEGMENT_CUDA_PTX_SHARED; + } + + ret = TValue::callbacks->read_memory(context, NULL, symbolAddr, + 1 * TValue::type_sizes.sizeof_long_long, &(tmpOffset)); if (ret != ompd_rc_ok) return ret; - ret = TValue::callbacks->target_to_host( - context, &tmpOffset, ompd_sizeof(ompd_type_long_long), 1, &fieldSize); + ret = TValue::callbacks->device_to_host(context, &tmpOffset, + TValue::type_sizes.sizeof_long_long, + 1, &fieldSize); if (ret != ompd_rc_ok) { return ret; } @@ -209,7 +264,7 @@ TValue::TValue(ompd_address_space_context_t *_context, /*valueName(_valueName),*/ context(_context), tcontext(_tcontext), fieldSize(0) { errorState.errorCode = - callbacks->tsymbol_addr(context, tcontext, _valueName, &symbolAddr); + callbacks->symbol_addr_lookup(context, tcontext, _valueName, &symbolAddr); symbolAddr.segment = segment; // assert((ret==ompd_rc_ok) && "Callback call failed"); } @@ -257,14 +312,14 @@ TValue TValue::dereference() const { assert(pointerLevel > 0 && "cannot dereference non-pointer"); TValue ret = *this; ret.pointerLevel--; - ret.errorState.errorCode = callbacks->read_tmemory( - context, tcontext, symbolAddr, 1 * ompd_sizeof(ompd_type_pointer), + ret.errorState.errorCode = callbacks->read_memory( + context, tcontext, symbolAddr, 1 * TValue::type_sizes.sizeof_pointer, &(tmpAddr.address)); if (ret.errorState.errorCode != ompd_rc_ok) return ret; - ret.errorState.errorCode = callbacks->target_to_host( - context, &(tmpAddr.address), ompd_sizeof(ompd_type_pointer), 1, + ret.errorState.errorCode = 
callbacks->device_to_host( + context, &(tmpAddr.address), TValue::type_sizes.sizeof_pointer, 1, &(ret.symbolAddr.address)); if (ret.errorState.errorCode != ompd_rc_ok) { return ret; @@ -290,7 +345,7 @@ ompd_rc_t TValue::getRawValue(void *buf, int count) { return errorState.errorCode; errorState.errorCode = - callbacks->read_tmemory(context, tcontext, symbolAddr, size, buf); + callbacks->read_memory(context, tcontext, symbolAddr, size, buf); return errorState.errorCode; } @@ -307,7 +362,11 @@ TBaseValue TValue::castBase(const char *varName) { return TBaseValue(*this, size); } -TBaseValue TValue::castBase() const { return TBaseValue(*this, fieldSize); } +TBaseValue TValue::castBase() const { + if(pointerLevel>0) + return TBaseValue(*this, type_sizes.sizeof_pointer); + return TBaseValue(*this, fieldSize); +} TBaseValue TValue::castBase(ompd_target_prim_types_t baseType) const { return TBaseValue(*this, baseType); @@ -345,7 +404,12 @@ ompd_rc_t TValue::check(const char *bitfieldName, ompd_word_t *isSet) const { TValue TValue::getArrayElement(int elemNumber) const { if (gotError()) return *this; - TValue ret = dereference(); + TValue ret; + if (pointerLevel > 0) { + ret = dereference(); + } else { + ret = *this; + } if (ret.pointerLevel == 0) { ompd_size_t size; ret.errorState.errorCode = type->getSize(&size); @@ -356,6 +420,16 @@ TValue TValue::getArrayElement(int elemNumber) const { return ret; } +TValue TValue::getPtrArrayElement(int elemNumber) const { + if (gotError()) { + return *this; + } + assert(pointerLevel > 0 && "This only works on arrays of pointers"); + TValue ret = *this; + ret.symbolAddr.address += elemNumber * type_sizes.sizeof_pointer; + return ret; +} + TBaseValue::TBaseValue(const TValue &_tvalue, ompd_target_prim_types_t _baseType) : TValue(_tvalue), baseTypeSize(ompd_sizeof(_baseType)) {} @@ -365,12 +439,12 @@ TBaseValue::TBaseValue(const TValue &_tvalue, ompd_size_t _baseTypeSize) ompd_rc_t TBaseValue::getValue(void *buf, int count) { if 
(errorState.errorCode != ompd_rc_ok) return errorState.errorCode; - errorState.errorCode = callbacks->read_tmemory(context, tcontext, symbolAddr, + errorState.errorCode = callbacks->read_memory(context, tcontext, symbolAddr, count * baseTypeSize, buf); if (errorState.errorCode != ompd_rc_ok) return errorState.errorCode; errorState.errorCode = - callbacks->target_to_host(context, buf, baseTypeSize, count, buf); + callbacks->device_to_host(context, buf, baseTypeSize, count, buf); return errorState.errorCode; } @@ -378,7 +452,7 @@ ompd_rc_t TBaseValue::getValue(void *buf, int count) { // { // if( errorState.errorCode != ompd_rc_ok ) // return errorState.errorCode; -// errorState.errorCode = callbacks->read_tmemory(context, tcontext, +// errorState.errorCode = callbacks->read_memory(context, tcontext, // symbolAddr, // count, baseType, &(buf->th)); // assert((errorState.errorCode == ompd_rc_ok) && "Callback call failed"); diff --git a/libompd/src/TargetValue.h b/libompd/src/TargetValue.h index cbf8a4f9f..cf14ea716 100644 --- a/libompd/src/TargetValue.h +++ b/libompd/src/TargetValue.h @@ -1,5 +1,6 @@ #include "ompd.h" +#include "ompd-private.h" #include #ifndef SRC_TARGET_VALUE_H_ @@ -100,7 +101,7 @@ class TValue { public: static const ompd_callbacks_t *callbacks; - static ompd_target_type_sizes_t type_sizes; + static ompd_device_type_sizes_t type_sizes; TValue() : errorState(ompd_rc_error) {} /** @@ -185,6 +186,10 @@ class TValue { * Get an array element */ TValue getArrayElement(int elemNumber) const; + /** + * Get an element of a pointer arraz + */ + TValue getPtrArrayElement(int elemNumber) const; /** * Did we raise some error yet? 
*/ diff --git a/libompd/src/omp-debug.cpp b/libompd/src/omp-debug.cpp index 325872273..fae2b63cc 100644 --- a/libompd/src/omp-debug.cpp +++ b/libompd/src/omp-debug.cpp @@ -15,27 +15,27 @@ #include "omp-debug.h" #include "omp.h" -#include "ompd.h" -// #include +#include "ompd-private.h" #include "TargetValue.h" #include #include #include #include #include -#include -ompd_target_type_sizes_t type_sizes; +ompd_device_type_sizes_t type_sizes; uint64_t ompd_state; /* --- OMPD functions ------------------------------------------------------- */ -/* --- 3 Initialization ----------------------------------------------------- */ +/* --- 1 Initialization ----------------------------------------------------- */ -ompd_rc_t ompd_initialize(const ompd_callbacks_t *table, ompd_word_t version) { +ompd_rc_t ompd_initialize(ompd_word_t version, const ompd_callbacks_t *table) { ompd_rc_t ret = table ? ompd_rc_ok : ompd_rc_bad_input; callbacks = table; TValue::callbacks = table; + __ompd_init_icvs(table); + __ompd_init_states(table); return ret; } @@ -57,36 +57,28 @@ ompd_process_initialize(ompd_address_space_context_t ompd_rc_t ret = initTypeSizes(context); if (ret != ompd_rc_ok) return ret; - ret = TValue(context, "ompd_rtl_version") - .castBase(ompd_type_int) - .getValue(rtl_version); - if ((ret == ompd_rc_ok && rtl_version < 5) || - ret == ompd_rc_target_read_error) - return ompd_rc_incompatible; - if (ret != ompd_rc_ok) - return ret; + ret = TValue(context, "ompd_state") .castBase(ompd_type_long_long) .getValue(ompd_state); if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_address_space_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_address_space_handle_t), (void **)(addrhandle)); if (ret != ompd_rc_ok) return ret; -// *addrhandle = new ompd_address_space_handle_t; if (!addrhandle) return ompd_rc_error; (*addrhandle)->context = context; - (*addrhandle)->kind = ompd_device_kind_host; + (*addrhandle)->kind = OMPD_DEVICE_KIND_HOST; return 
ompd_rc_ok; } ompd_rc_t -ompd_get_openmp_version(ompd_address_space_handle_t +ompd_get_omp_version(ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_word_t *version) { + ompd_word_t *version) { if (!addr_handle) return ompd_rc_stale_handle; ompd_address_space_context_t *context = addr_handle->context; @@ -103,7 +95,7 @@ ompd_get_openmp_version(ompd_address_space_handle_t return ret; } -ompd_rc_t ompd_get_openmp_version_string( +ompd_rc_t ompd_get_omp_version_string( ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ const char **string) { @@ -121,48 +113,38 @@ ompd_rc_t ompd_release_address_space_handle( if (!addr_handle) return ompd_rc_bad_input; - ompd_rc_t ret = callbacks->dmemory_free((void *)(addr_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(addr_handle)); // delete addr_handle; return ret; } -#if 0 // no device support yet -ompd_rc_t ompd_device_initialize ( - ompd_address_space_context_t *context, /* IN: */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t **addrhandle /* OUT: ompd handle for the device */ +ompd_rc_t ompd_device_initialize( + ompd_address_space_handle_t *process_handle, + ompd_address_space_context_t *device_context, + int kind, + ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle ) { - if (!context) + if (!device_context) return ompd_rc_bad_input; - ompd_rc_t ret = initTypeSizes(context); - if (ret != ompd_rc_ok) - return ret; - + ompd_rc_t ret; uint64_t ompd_num_cuda_devices; - ompd_address_space_context_t *process_context; - ret = callbacks->get_containing_process_context(context, &process_context); - if ( ret != ompd_rc_ok ) - return ret; - - ret = TValue(process_context, "ompd_num_cuda_devices"). + ret = TValue(process_handle->context, "ompd_num_cuda_devices"). castBase(ompd_type_long_long). 
getValue(ompd_num_cuda_devices); - if (ret != ompd_rc_ok) { + if (ret != ompd_rc_ok) return ret; - } + for (uint64_t i = 0; i < ompd_num_cuda_devices; i++) { uint64_t cuda_ctx; - /* TODO(mjm) - Hack! Currently using ompt_parallel_id_t. Need to find a - * place to define ID type information for CUDA contexts - */ - ret = TValue(process_context, "ompd_CudaContextArray"). - cast("ompt_parallel_id_t",1). + ret = TValue(process_handle->context, "ompd_CudaContextArray"). + cast("ompd_cuda_context_ptr_t",1). getArrayElement(i). castBase(ompd_type_long_long). getValue(cuda_ctx); @@ -170,28 +152,24 @@ ompd_rc_t ompd_device_initialize ( if ( ret != ompd_rc_ok ) continue; - if (cuda_ctx == id) { - ret = callbacks->dmemory_alloc(sizeof(ompd_address_space_handle_t), - (void **)(addrhandle)); + if (cuda_ctx == *((uint64_t *)id)) { + ret = callbacks->memory_alloc(sizeof(ompd_address_space_handle_t), + (void **)(device_handle)); if (ret != ompd_rc_ok) return ret; -// *addrhandle = new ompd_address_space_handle_t; - if (!addrhandle) + if (!device_handle) return ompd_rc_error; - (*addrhandle)->context = context; - + (*device_handle)->context = device_context; + (*device_handle)->kind = OMPD_DEVICE_KIND_CUDA; + (*device_handle)->id = (uint64_t)id; return ompd_rc_ok; } } - /* TODO(mjm) - Find appropriate error return result for not finding a match */ - return ompd_rc_ok; + return ompd_rc_unavailable; } -#endif // no device support - -/* --- 4 Handle Management -------------------------------------------------- */ -/* --- 4.1 Thread Handles --------------------------------------------------- */ +/* --- 4.5 Thread Handles --------------------------------------------------- */ /* thread_handle is of type (kmp_base_info_t) */ @@ -206,32 +184,75 @@ ompd_rc_t ompd_get_thread_in_parallel( return ompd_rc_stale_handle; ompd_address_space_context_t *context = parallel_handle->ah->context; ompd_rc_t ret; - int i; if (!context) return ompd_rc_stale_handle; assert(callbacks && "Callback table not 
initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + uint16_t thread_idx; + // We cannot use the task descriptor associated with the parallel info as + // their task might not be currently active + // So to get the current thread, we access the tasks thread info and get + // get its threadIdx.x + auto TaskDescr = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(nth_handle); + + ret = TaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("threadIdx_x") + .castBase(ompd_type_short) + .getValue(thread_idx); + + if (ret != ompd_rc_ok) { + return ret; + } - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_threads") /*t.t_threads*/ - .cast("kmp_info_t", 2) - .getArrayElement(nth_handle) /*t.t_threads[nth_handle]*/ - .access("th") /*t.t_threads[i]->th*/ + ret = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 2, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getPtrArrayElement(thread_idx) + .dereference() .getAddress(&taddr); + if (taddr.address == 0 && thread_idx % 32 == 0) { + ret = TaskDescr.getAddress(&taddr); + } + } else { + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_threads") /*t.t_threads*/ + .cast("kmp_info_t", 2) + .getArrayElement(nth_handle) /*t.t_threads[nth_handle]*/ + .access("th") /*t.t_threads[i]->th*/ + .getAddress(&taddr); + } + if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + + ret = 
callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; (*thread_handle)->th = taddr; (*thread_handle)->ah = parallel_handle->ah; + (*thread_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } @@ -240,7 +261,7 @@ ompd_rc_t ompd_release_thread_handle( ) { if (!thread_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(thread_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(thread_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -253,30 +274,30 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, return ompd_rc_stale_handle; if (!thread_handle_2) return ompd_rc_stale_handle; + if (thread_handle_1->ah->kind != thread_handle_2->ah->kind) + return ompd_rc_bad_input; *cmp_value = thread_handle_1->th.address - thread_handle_2->th.address; - return ompd_rc_ok; -} + if (*cmp_value == 0 && thread_handle_1->ah->kind == OMPD_DEVICE_KIND_CUDA) { + *cmp_value = thread_handle_1->cuda_kernel_info->cudaDevId - + thread_handle_2->cuda_kernel_info->cudaDevId; + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->cudaContext - + thread_handle_2->cuda_kernel_info->cudaContext; + } + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->warpSize - + thread_handle_2->cuda_kernel_info->warpSize; + } + if (*cmp_value == 0) { + *cmp_value = thread_handle_1->cuda_kernel_info->gridId - + thread_handle_2->cuda_kernel_info->gridId; + } + } -#if 0 -ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id - ) -{ - pthread_t thread_id; - ompd_rc_t ret; - ret = ompd_get_thread_id(thread_handle, ompd_thread_id_pthread, sizeof(pthread_t), &thread_id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long 
long)thread_id); return ompd_rc_ok; } -#endif -/* --- 4.2 Parallel Region Handles------------------------------------------- */ +/* --- 4.6 Parallel Region Handles------------------------------------------- */ /* parallel_handle is of type (kmp_base_team_t)*/ @@ -289,38 +310,100 @@ ompd_rc_t ompd_get_current_parallel_handle( if (!thread_handle->ah) return ompd_rc_stale_handle; ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) + ompd_thread_context_t *thread_context = thread_handle->thread_context; + if (!context || !thread_context) return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; - TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_team") /*__kmp_threads[t]->th.th_team*/ - .cast("kmp_team_p", 1) - .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + ompd_rc_t ret; - ompd_rc_t ret = teamdata.getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + ompd_address_t taddr; + TValue ph; + // The ompd_parallel_info_t we need is only present in the previous task + // of an implicit task. 
+ uint16_t task_is_implicit = 0; + ret = ompd_rc_ok; + auto possibleTaskDescr = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + while (!task_is_implicit && ret == ompd_rc_ok) { + ret = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_implicit") + .castBase() + .getValue(task_is_implicit); + possibleTaskDescr = possibleTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", + 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference().getAddress(&taddr); + } - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) { + if (taddr.address == 0) { + ph = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext") + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } else { + return ret; + } + } else { + ph = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } + + ret = ph.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(parallel_handle)); - if (ret != ompd_rc_ok) - return ret; + if (ret != ompd_rc_ok) + return ret; + + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; + } else { + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; - 
(*parallel_handle)->ah = thread_handle->ah; - (*parallel_handle)->th = taddr; - (*parallel_handle)->lwt = lwt; + TValue teamdata = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_team") /*__kmp_threads[t]->th.th_team*/ + .cast("kmp_team_p", 1) + .access("t"); /*__kmp_threads[t]->th.th_team->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), + (void **)(parallel_handle)); + if (ret != ompd_rc_ok) + return ret; + + (*parallel_handle)->ah = thread_handle->ah; + (*parallel_handle)->th = taddr; + (*parallel_handle)->lwt = lwt; + } return ompd_rc_ok; } @@ -339,47 +422,124 @@ ompd_rc_t ompd_get_enclosing_parallel_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = parallel_handle->th, lwt; - ompd_rc_t ret = ompd_rc_stale_handle; - TValue lwtValue = TValue(context, parallel_handle->lwt); - if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 - { // if we are in lwt, get parent - ret = lwtValue.cast("ompt_lw_taskteam_t", 0) - .access("parent") - .cast("ompt_lw_taskteam_t", 1) - .dereference() - .getAddress(&lwt); - } - if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + ompd_address_t taddr = parallel_handle->th, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; + ompd_rc_t ret; - TValue teamdata = - TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_parent") /*t.t_parent*/ - .cast("kmp_team_p", 1) - .access("t"); /*t.t_parent->t*/ + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + uint16_t level; + TValue curParallelInfo = TValue(context, taddr) + .cast("ompd_nvptx_parallel_info_t", 0, + 
OMPD_SEGMENT_CUDA_PTX_SHARED); + + ret = curParallelInfo + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("level") + .castBase(ompd_type_short) + .getValue(level); - ret = teamdata.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = teamdata.cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); - if (ret != ompd_rc_ok) + TValue prevTaskDescr = curParallelInfo.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference(); + + ret = prevTaskDescr.getAddress(&taddr); + + // If the previous task of the tasks of the current parallel region is + // NULL, then we got the parallel handle for the (implicit?) top level + // task which has no enclosing task. + if (ret != ompd_rc_ok) { + return ret; + } + + // The instance of TaskDescr for the previous task contains the parallel + // info for the current parallel region. 
So we have to go back to the + // previous task of the previous task + prevTaskDescr = prevTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .dereference(); + + ret = prevTaskDescr.getAddress(&taddr); + + if (ret != ompd_rc_ok) { + if (taddr.address == 0 && level == 1) { + // If we are in generic mode, there is an implicit parallel region + // around the master thread + prevTaskDescr = TValue(context, NULL, "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo"); + } else { + return ret; + } + } else { + prevTaskDescr = prevTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel"); + } + + ret = prevTaskDescr.cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getAddress(&taddr); + + if (ret != ompd_rc_ok) { return ret; + } + } else { + ret = ompd_rc_stale_handle; + TValue lwtValue = TValue(context, parallel_handle->lwt); + if (lwtValue.getError() == ompd_rc_ok) // lwt == 0x0 + { // if we are in lwt, get parent + ret = lwtValue.cast("ompt_lw_taskteam_t", 0) + .access("parent") + .cast("ompt_lw_taskteam_t", 1) + .dereference() + .getAddress(&lwt); + } + if (ret != ompd_rc_ok) { // no lwt or parent==0x0 + + TValue teamdata = + TValue(context, parallel_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_parent") /*t.t_parent*/ + .cast("kmp_team_p", 1) + .access("t"); /*t.t_parent->t*/ + + ret = teamdata.getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; + ret = teamdata.cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + if (ret != ompd_rc_ok) + return ret; + } } - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = 
callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(enclosing_parallel_handle)); if (ret != ompd_rc_ok) return ret; (*enclosing_parallel_handle)->th = taddr; (*enclosing_parallel_handle)->lwt = lwt; (*enclosing_parallel_handle)->ah = parallel_handle->ah; + (*enclosing_parallel_handle)->cuda_kernel_info = + parallel_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -398,20 +558,66 @@ ompd_rc_t ompd_get_task_parallel_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret; - ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .getAddress(&taddr); + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + TValue parallelHandle; + // The ompd_parallel_info_t we need is only present in the previous task + // of an implicit task. + uint16_t task_is_implicit = 0; + ret = ompd_rc_ok; + auto possibleTaskDescr = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + while (!task_is_implicit && ret == ompd_rc_ok) { + ret = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_implicit") + .castBase() + .getValue(task_is_implicit); + possibleTaskDescr = possibleTaskDescr.access("prev") + .cast("omptarget_nvptx_TaskDescr", + 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL); + ret = possibleTaskDescr.dereference().getAddress(&taddr); + } + + if (ret != ompd_rc_ok) { + if (taddr.address == 0) { + parallelHandle = TValue(context, NULL, + "omptarget_nvptx_threadPrivateContext") + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_levelZeroParallelInfo") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } else { + return ret; + } + } else { + 
parallelHandle = possibleTaskDescr.access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("enclosed_parallel") + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + } + ret = parallelHandle.getAddress(&taddr); + } else { + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_parallel_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_parallel_handle_t), (void **)(enclosing_parallel_handle)); if (ret != ompd_rc_ok) return ret; @@ -419,6 +625,7 @@ ompd_rc_t ompd_get_task_parallel_handle( (*enclosing_parallel_handle)->ah = task_handle->ah; (*enclosing_parallel_handle)->lwt = task_handle->lwt; (*enclosing_parallel_handle)->th = taddr; + (*enclosing_parallel_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ompd_rc_ok; } @@ -427,7 +634,7 @@ ompd_rc_t ompd_release_parallel_handle( ) { if (!parallel_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(parallel_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(parallel_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -441,38 +648,25 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, return ompd_rc_stale_handle; if (!parallel_handle_2) return ompd_rc_stale_handle; - if (parallel_handle_1->th.address - parallel_handle_2->th.address) + if (parallel_handle_1->ah->kind != parallel_handle_2->ah->kind) + return ompd_rc_bad_input; + if (parallel_handle_1->ah->kind == OMPD_DEVICE_KIND_HOST) { + if (parallel_handle_1->th.address - parallel_handle_2->th.address) + *cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; + else + *cmp_value = + parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + } else { 
*cmp_value = parallel_handle_1->th.address - parallel_handle_2->th.address; - else - *cmp_value = - parallel_handle_1->lwt.address - parallel_handle_2->lwt.address; + } return ompd_rc_ok; } -#if 0 // parallel-id is initialized to zero -ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id - ) -{ - ompd_parallel_id_t id; - ompd_rc_t ret; - ret = ompd_get_parallel_id(parallel_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; -} -#endif - -/* --- 4.3 Task Handles ----------------------------------------------------- */ +/* --- 4.7 Task Handles ----------------------------------------------------- */ /* task_handle is of type (kmp_taskdata_t) */ -ompd_rc_t ompd_get_current_task__handle( +ompd_rc_t ompd_get_current_task_handle( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ ) { @@ -485,31 +679,38 @@ ompd_rc_t ompd_get_current_task__handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr, lwt; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; + ompd_rc_t ret = ompd_rc_ok; - TValue taskdata = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ - .cast("kmp_taskdata_t", 1); + lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ompd_rc_t ret = taskdata.dereference().getAddress(&taddr); - if (ret != ompd_rc_ok) - return ret; + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + lwt.address = 0; + taddr = thread_handle->th; + } else { + TValue taskdata = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + 
.access("th_current_task") /*__kmp_threads[t]->th.th_current_task*/ + .cast("kmp_taskdata_t", 1); - lwt.segment = OMPD_SEGMENT_UNSPECIFIED; - ret = taskdata - .access("td_team") /*td.td_team*/ - .cast("kmp_team_p", 1) - .access("t") /*td.td_team->t*/ - .cast("kmp_base_team_t", 0) - .access("ompt_serialized_team_info") - .castBase() - .getValue(lwt.address); + ret = taskdata.dereference().getAddress(&taddr); + if (ret != ompd_rc_ok) + return ret; + + ret = taskdata + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("ompt_serialized_team_info") + .castBase() + .getValue(lwt.address); + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(task_handle)); if (ret != ompd_rc_ok) return ret; @@ -517,13 +718,19 @@ ompd_rc_t ompd_get_current_task__handle( (*task_handle)->th = taddr; (*task_handle)->lwt = lwt; (*task_handle)->ah = thread_handle->ah; + (*task_handle)->cuda_kernel_info = thread_handle->cuda_kernel_info; return ompd_rc_ok; } -ompd_rc_t ompd_get_generating_ancestor_task_handle( +ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { + // Generating and Scheduling task are the same on cuda? 
+ if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + return ompd_get_scheduling_task_handle(task_handle, parent_task_handle); + } + if (!task_handle) return ompd_rc_stale_handle; if (!task_handle->ah) @@ -533,7 +740,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr = task_handle->th, lwt; + ompd_address_t taddr = task_handle->th, lwt={OMPD_SEGMENT_UNSPECIFIED,0}; ompd_rc_t ret = ompd_rc_stale_handle; TValue lwtValue = TValue(context, task_handle->lwt); @@ -569,7 +776,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ret; } - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(parent_task_handle)); if (ret != ompd_rc_ok) return ret; @@ -580,7 +787,7 @@ ompd_rc_t ompd_get_generating_ancestor_task_handle( return ret; } -ompd_rc_t ompd_get_scheduling_ancestor_task_handle( +ompd_rc_t ompd_get_scheduling_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ) { @@ -593,27 +800,47 @@ ompd_rc_t ompd_get_scheduling_ancestor_task_handle( return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + ompd_rc_t ret; - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") /*td*/ - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("scheduling_parent") // td->ompd_task_info.scheduling_parent - .cast("kmp_taskdata_t", 1) - .dereference() - .getAddress(&taddr); + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + ret = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("prev") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + 
.dereference() + .getAddress(&taddr); + if (taddr.address == 0) { + return ompd_rc_unavailable; + } + } else { + ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("ompt_task_info") // td->ompt_task_info + .cast("ompt_task_info_t") + .access("scheduling_parent") // td->ompd_task_info.scheduling_parent + .cast("kmp_taskdata_t", 1) + .castBase() + .getValue(taddr.address); + if (taddr.address == 0) { + return ompd_rc_unavailable; + } + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(parent_task_handle)); if (ret != ompd_rc_ok) return ret; (*parent_task_handle)->th = taddr; + (*parent_task_handle)->lwt = {OMPD_SEGMENT_UNSPECIFIED,0}; (*parent_task_handle)->ah = task_handle->ah; + (*parent_task_handle)->cuda_kernel_info = task_handle->cuda_kernel_info; return ret; } @@ -634,24 +861,38 @@ ompd_rc_t ompd_get_task_in_parallel( assert(callbacks && "Callback table not initialized!"); ompd_rc_t ret; - ompd_address_t taddr; - ret = TValue(context, parallel_handle->th) /* t */ - .cast("kmp_base_team_t", 0) - .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ - .cast("kmp_taskdata_t", 1) - .getArrayElement( - nth_handle) /*t.t_implicit_task_taskdata[nth_handle]*/ - .getAddress(&taddr); + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + + if (parallel_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_paralel_info", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .getArrayElement(nth_handle) + .getAddress(&taddr); + } else { + ret = TValue(context, parallel_handle->th) /* t */ + .cast("kmp_base_team_t", 0) + .access("t_implicit_task_taskdata") /*t.t_implicit_task_taskdata*/ + .cast("kmp_taskdata_t", 1) + .getArrayElement( + nth_handle) 
/*t.t_implicit_task_taskdata[nth_handle]*/ + .getAddress(&taddr); + } if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_task_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_task_handle_t), (void **)(task_handle)); if (ret != ompd_rc_ok) return ret; (*task_handle)->th = taddr; (*task_handle)->ah = parallel_handle->ah; + (*task_handle)->lwt = {OMPD_SEGMENT_UNSPECIFIED,0}; + (*task_handle)->cuda_kernel_info = parallel_handle->cuda_kernel_info; return ret; } @@ -660,7 +901,7 @@ ompd_rc_t ompd_release_task_handle( ) { if (!task_handle) return ompd_rc_stale_handle; - ompd_rc_t ret = callbacks->dmemory_free((void *)(task_handle)); + ompd_rc_t ret = callbacks->memory_free((void *)(task_handle)); if (ret != ompd_rc_ok) return ret; return ompd_rc_ok; @@ -673,38 +914,26 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, return ompd_rc_stale_handle; if (!task_handle_2) return ompd_rc_stale_handle; - if (task_handle_1->th.address - task_handle_2->th.address) + if (task_handle_1->ah->kind != task_handle_2->ah->kind) + return ompd_rc_bad_input; + if (task_handle_1->th.address - task_handle_2->th.address || + task_handle_1->ah->kind == OMPD_DEVICE_KIND_CUDA) *cmp_value = task_handle_1->th.address - task_handle_2->th.address; else *cmp_value = task_handle_1->lwt.address - task_handle_2->lwt.address; return ompd_rc_ok; } -#if 0 // all task ids are initialized to zero -ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id - ) -{ - ompd_task_id_t id; - ompd_rc_t ret = ompd_get_task_id(task_handle, &id); - if (ret!=ompd_rc_ok) - return ret; - ret = callbacks->dmemory_alloc(sizeof(void*)*2+3, (void**)string_id); - if (ret!=ompd_rc_ok) - return ret; - sprintf(*string_id, "0x%llx", (long long)id); - return ompd_rc_ok; -} -#endif +/* --- 7 Thread Inquiry ----------------------------------------------------- */ -/* --- 5 Process and Thread Settings ---------------------------------------- 
*/ +/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ) { +ompd_get_thread_handle(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle) { if (!addr_handle) return ompd_rc_stale_handle; ompd_address_space_context_t *context = addr_handle->context; @@ -714,237 +943,91 @@ ompd_get_num_procs(ompd_address_space_handle_t return ompd_rc_stale_handle; assert(callbacks && "Callback table not initialized!"); + ompd_thread_context_t *tcontext; + ret = callbacks->get_thread_context_for_thread_id( + context, kind, sizeof_thread_id, thread_id, &tcontext); + if (ret != ompd_rc_ok) + return ret; - int nth; - ret = TValue(context, "__kmp_avail_proc") - .castBase("__kmp_avail_proc") - .getValue(nth); - *val = nth; - return ret; -} - -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; + int tId; - if (!context) - return ompd_rc_stale_handle; + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { + ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - assert(callbacks && "Callback table not initialized!"); + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x] + TValue th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("topTaskDescr") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + 
.getPtrArrayElement(p->threadIdx.x) + .dereference(); + + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; + ret = th.getAddress(&taddr); - int nth; - ret = - TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); - *val = nth; - return ret; -} + if (ret != ompd_rc_ok) { + if (taddr.address == 0 && p->threadIdx.x % 32 == 0) { + // check for the master task/thread instead + // The master thread should never have the threadIdx.x of zero, so + // checking it this way should be safe -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ + th = TValue(context, tcontext, + "omptarget_nvptx_threadPrivateContext", + OMPD_SEGMENT_CUDA_PTX_SHARED) + .cast("omptarget_nvptx_ThreadPrivateContext", 1, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("teamContext") + .cast("omptarget_nvptx_TeamDescr", 0, + OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("levelZeroTaskDescr"); -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + ret = th.getAddress(&taddr); - assert(callbacks && "Callback table not initialized!"); + if (ret != ompd_rc_ok) + return ret; + } else { + return ret; + } + } - ompd_rc_t ret = ompd_rc_ok; - if (parallel_handle->lwt.address != 0) - *val = 1; - else { - uint32_t res; - ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_nproc") /*t.t_nproc*/ - .castBase() - .getValue(res); - *val = res; - } - return ret; -} + // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x] + // ->ompd_thread_info.threadIdx_x + ret = th.cast("omptarget_nvptx_TaskDescr", 0, 
OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("threadIdx_x") + .castBase(ompd_type_short) + .getValue(tId); -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + if (ret != ompd_rc_ok) + return ret; - assert(callbacks && "Callback table not initialized!"); + if (tId != p->threadIdx.x) { + return ompd_rc_stale_handle; + } - uint32_t res; + // allocate both the thread handle and the cuda kernel info in one go + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t) + + sizeof(ompd_cuda_thread_kernel_info_t), + (void **)(thread_handle)); + if (ret != ompd_rc_ok) + return ret; - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_level") /*t.t_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} - -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - uint32_t res; - - ompd_rc_t ret = TValue(context, parallel_handle->th) - .cast("kmp_base_team_t", 0) /*t*/ - .access("t_active_level") /*t.t_active_level*/ - .castBase() - .getValue(res); - *val = res; - return ret; -} - -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - -ompd_rc_t 
ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ) { - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - - TValue teamInfo; - if (parallel_handle->lwt.address != 0) - teamInfo = TValue(context, parallel_handle->lwt) - .cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - teamInfo = - TValue(context, parallel_handle->th).cast("kmp_base_team_t", 0); /*t*/ - ompd_rc_t ret = teamInfo - .access("ompt_team_info") /*t.ompt_team_info*/ - .cast("ompt_team_info_t", 0) - .access("parallel_data") /*t.ompt_team_info.parallel_id*/ - .getAddress(data); - return ret; -} - -#if 0 // there is no such thing as a parallel function -ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ) -{ - if (!parallel_handle) - return ompd_rc_stale_handle; - if (!parallel_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = parallel_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - parallel_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - - TValue teamInfo; - if(parallel_handle->lwt.address!=0) - teamInfo = TValue(context, parallel_handle->lwt). - cast("ompt_lw_taskteam_t",0); /*lwt*/ - else - teamInfo = TValue(context, parallel_handle->th). - cast("kmp_base_team_t",0); /*t*/ - ompd_rc_t ret = teamInfo. - access("ompt_team_info"). /*t.ompt_team_info*/ - cast("ompt_team_info_t",0). 
- access("microtask"). /*t.ompt_team_info.microtask*/ - castBase(). - getValue(parallel_addr->address); - return ret; -} -#endif // no parallel function - -/* --- 7 Thread Inquiry ----------------------------------------------------- */ - -/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - -ompd_rc_t -ompd_get_thread_handle(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle) { - if (!addr_handle) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = addr_handle->context; - ompd_rc_t ret; - - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - ompd_thread_context_t *tcontext; - ret = callbacks->get_thread_context_for_thread_id( - context, kind, sizeof_thread_id, thread_id, &tcontext); - if (ret != ompd_rc_ok) - return ret; - - int tId; - - if (kind == ompd_thread_id_cudalogical) { - ompd_cudathread_coord_t *p = (ompd_cudathread_coord_t *)thread_id; - - // omptarget_nvptx_threadPrivateContext->topTaskDescr[p->threadIdx.x]->data.items.threadId - - ret = - TValue(context, tcontext, "omptarget_nvptx_threadPrivateContext", - OMPD_SEGMENT_CUDA_PTX_SHARED) - .cast("omptarget_nvptx_ThreadPrivateContext", 1, - OMPD_SEGMENT_CUDA_PTX_SHARED) - .access("topTaskDescr") - .cast("omptarget_nvptx_TaskDescr", 1, OMPD_SEGMENT_CUDA_PTX_GLOBAL) - .getArrayElement(p->threadIdx.x) - .access("data__items__threadId") - .castBase(ompd_type_short) - .getValue(tId); - - if (ret != ompd_rc_ok) - return ret; - - if (tId != p->threadIdx.x) - return ompd_rc_stale_handle; + (*thread_handle)->ah = addr_handle; + (*thread_handle)->th = taddr; + (*thread_handle)->cuda_kernel_info = + (ompd_cuda_thread_kernel_info_t*)((*thread_handle) + 1); + + (*thread_handle)->cuda_kernel_info->cudaDevId = p->cudaDevId; + 
(*thread_handle)->cuda_kernel_info->cudaContext = p->cudaContext; + (*thread_handle)->cuda_kernel_info->warpSize = p->warpSize; + (*thread_handle)->cuda_kernel_info->gridId = p->gridId; + (*thread_handle)->cuda_kernel_info->gridDim = p->gridDim; + (*thread_handle)->cuda_kernel_info->blockDim = p->blockDim; } else { ret = TValue(context, tcontext, "__kmp_gtid") .castBase("__kmp_gtid") @@ -960,16 +1043,17 @@ ompd_get_thread_handle(ompd_address_space_handle_t .getArrayElement(tId) /*__kmp_threads[t]*/ .access("th"); /*__kmp_threads[t]->th*/ - ompd_address_t taddr; + ompd_address_t taddr={OMPD_SEGMENT_UNSPECIFIED,0}; ret = th.getAddress(&taddr); if (ret != ompd_rc_ok) return ret; - ret = callbacks->dmemory_alloc(sizeof(ompd_thread_handle_t), + ret = callbacks->memory_alloc(sizeof(ompd_thread_handle_t), (void **)(thread_handle)); if (ret != ompd_rc_ok) return ret; (*thread_handle)->ah = addr_handle; (*thread_handle)->th = taddr; + (*thread_handle)->cuda_kernel_info = NULL; #ifndef NDEBUG if (ret != ompd_rc_ok) @@ -990,13 +1074,14 @@ ompd_get_thread_handle(ompd_address_space_handle_t "Callback table not initialized!"); #endif } + (*thread_handle)->thread_context = tcontext; return ret; } ompd_rc_t ompd_get_thread_id( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { - if (kind != ompd_thread_id_pthread) + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id) { + if (kind != OMPD_THREAD_ID_PTHREAD && kind != OMPD_THREAD_ID_CUDALOGICAL) return ompd_rc_bad_input; if (!thread_handle) return ompd_rc_stale_handle; @@ -1005,52 +1090,68 @@ ompd_rc_t ompd_get_thread_id( ompd_address_space_context_t *context = thread_handle->ah->context; if (!context) return ompd_rc_stale_handle; - ompd_size_t size; - ompd_rc_t ret = tf.getType(context, "kmp_thread_t").getSize(&size); - if (ret != ompd_rc_ok) - return ret; - if (sizeof_thread_id != size) - return ompd_rc_bad_input; + 
ompd_rc_t ret; - assert(callbacks && "Callback table not initialized!"); + if (kind == OMPD_THREAD_ID_CUDALOGICAL) { + if (sizeof_thread_id != sizeof(ompd_cudathread_coord_t)) { + return ompd_rc_bad_input; + } + ompd_cudathread_coord_t *cuda_thread_id = + (ompd_cudathread_coord_t*)thread_id; + cuda_thread_id->cudaDevId = thread_handle->cuda_kernel_info->cudaDevId; + cuda_thread_id->cudaContext = thread_handle->cuda_kernel_info->cudaContext; + cuda_thread_id->warpSize = thread_handle->cuda_kernel_info->warpSize; + cuda_thread_id->gridId = thread_handle->cuda_kernel_info->gridId; + + auto threadInfo = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL); + + ret = threadInfo.access("threadIdx_x") + .castBase() + .getValue(cuda_thread_id->threadIdx.x); - ret = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_info") /*__kmp_threads[t]->th.th_info*/ - .cast("kmp_desc_t") - .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ - .cast("kmp_desc_base_t") - .access("ds_thread") /*__kmp_threads[t]->th.th_info.ds.ds_thread*/ - .cast("kmp_thread_t") - .getRawValue(thread_id, 1); - return ret; -} + if (ret != ompd_rc_ok) + return ret; -ompd_rc_t ompd_get_thread_num( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ) { - // __kmp_threads[8]->th.th_info.ds.ds_tid - if (!thread_handle) - return ompd_rc_stale_handle; - if (!thread_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = thread_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; + cuda_thread_id->threadIdx.y = cuda_thread_id->threadIdx.z = 0; - assert(callbacks && "Callback table not initialized!"); + ret = threadInfo.access("blockIdx_x") + .castBase() + 
.getValue(cuda_thread_id->blockIdx.x); - ompd_rc_t ret = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("th_info") /*__kmp_threads[t]->th.th_info*/ - .cast("kmp_desc_t") - .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ - .cast("kmp_desc_base_t") - .access("ds_tid") /*__kmp_threads[t]->th.th_info.ds.ds_tid*/ - .castBase() - .getValue(*val); + if (ret != ompd_rc_ok) + return ret; + + cuda_thread_id->blockIdx.y = cuda_thread_id->blockIdx.z = 0; + + cuda_thread_id->gridDim = thread_handle->cuda_kernel_info->gridDim; + cuda_thread_id->blockDim = thread_handle->cuda_kernel_info->blockDim; + + return ompd_rc_ok; + } else { + ompd_size_t size; + ret = tf.getType(context, "kmp_thread_t").getSize(&size); + if (ret != ompd_rc_ok) + return ret; + if (sizeof_thread_id != size) + return ompd_rc_bad_input; + + assert(callbacks && "Callback table not initialized!"); + + ret = TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_info") /*__kmp_threads[t]->th.th_info*/ + .cast("kmp_desc_t") + .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ + .cast("kmp_desc_base_t") + .access("ds_thread") /*__kmp_threads[t]->th.th_info.ds.ds_thread*/ + .cast("kmp_thread_t") + .getRawValue(thread_id, 1); + } return ret; } @@ -1071,26 +1172,38 @@ ompd_rc_t ompd_get_state( if (!ompd_state) return ompd_rc_needs_state_tracking; + ompd_rc_t ret; assert(callbacks && "Callback table not initialized!"); - TValue ompt_thread_info = - TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ - .cast("kmp_base_info_t") - .access("ompt_thread_info") /*__kmp_threads[t]->th.ompt_thread_info*/ - .cast("ompt_thread_info_t"); - if (ompt_thread_info.gotError()) - return ompt_thread_info.getError(); - ompd_rc_t ret = - ompt_thread_info - .access("state") /*__kmp_threads[t]->th.ompt_thread_info.state*/ - .castBase() - .getValue(*state); - if (ret != ompd_rc_ok) - return ret; - ret = ompt_thread_info - 
.access("wait_id") /*__kmp_threads[t]->th.ompt_thread_info.state*/ + if (thread_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + if (wait_id) + *wait_id = 0; + ret = TValue(context, thread_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, OMPD_SEGMENT_CUDA_PTX_SHARED) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("state") + .castBase(ompd_type_long_long) + .getValue(*state); + } else { + TValue ompt_thread_info = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("ompt_thread_info") /*__kmp_threads[t]->th.ompt_thread_info*/ + .cast("ompt_thread_info_t"); + if (ompt_thread_info.gotError()) + return ompt_thread_info.getError(); + ret = ompt_thread_info + .access("state") /*__kmp_threads[t]->th.ompt_thread_info.state*/ .castBase() - .getValue(*wait_id); + .getValue(*state); + if (ret != ompd_rc_ok) + return ret; + ret = ompt_thread_info + .access("wait_id") /*__kmp_threads[t]->th.ompt_thread_info.state*/ + .castBase() + .getValue(*wait_id); + } return ret; } @@ -1098,244 +1211,6 @@ ompd_rc_t ompd_get_state( /* --- 8.1 Task Settings ---------------------------------------------------- */ -ompd_rc_t ompd_get_max_threads( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("nproc") // td->td_icvs.dynamic - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t ompd_in_parallel( // Why do we need a task context for _in_parallel? 
- ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - ompd_rc_t ret; - - assert(callbacks && "Callback table not initialized!"); - - ret = TValue(context, "__kmp_root") // __kmp_root - .cast("kmp_root_t", 2) - .dereference() // (*__kmp_root) - .access("r") // (*__kmp_root)->r - .cast("kmp_base_root_t") - .access("r_in_parallel") // (*__kmp_root)->r.r_in_parallel - .castBase() - .getValue(*val); - if (ret != ompd_rc_ok) - return ret; - if (*val) - *val = 1; - - return ret; -} - -ompd_rc_t -ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_flags") // td->td_icvs - .cast("kmp_tasking_flags_t") - .check("final", val); // td->td_icvs.max_active_levels - - return ret; -} - -ompd_rc_t -ompd_get_dynamic(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - 
.cast("kmp_internal_control_t", 0) - .access("dynamic") // td->td_icvs.dynamic - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t -ompd_get_nested(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("nested") // td->td_icvs.nested - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t ompd_get_max_active_levels( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = - TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("max_active_levels") // td->td_icvs.max_active_levels - .castBase() - .getValue(*val); - - return ret; -} - -ompd_rc_t -ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback 
table not initialized!"); - - TValue sched = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("sched") // td->td_icvs.sched - .cast("kmp_r_sched_t", 0); - - ompd_rc_t ret = sched - .access("r_sched_type") // td->td_icvs.sched.r_sched_type - .castBase() - .getValue(*kind); - if (ret != ompd_rc_ok) - return ret; - ret = sched - .access("chunk") // td->td_icvs.sched.r_sched_type - .castBase() - .getValue(*modifier); - return ret; -} - -ompd_rc_t -ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *bind /* OUT: Kind of proc-binding */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_icvs") // td->td_icvs - .cast("kmp_internal_control_t", 0) - .access("proc_bind") // td->td_icvs.proc_bind - .castBase() - .getValue(*bind); - - return ret; -} - -ompd_rc_t -ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - - assert(callbacks && "Callback table not initialized!"); - - ompd_rc_t ret = TValue(context, task_handle->th) - .cast("kmp_taskdata_t") // td - .access("td_flags") // td->td_flags - .cast("kmp_tasking_flags_t") - .check("tasktype", val); // td->td_flags.tasktype - *val ^= 1; // tasktype: explicit = 1, implicit = 0 => invert the value - return ret; -} - /* --- 8.2 OMPT Task Inquiry Analogues 
-------------------------------------- */ ompd_rc_t ompd_get_task_frame( @@ -1365,7 +1240,7 @@ ompd_rc_t ompd_get_task_frame( .access("ompt_task_info") // td->ompt_task_info .cast("ompt_task_info_t") .access("frame") // td->ompd_task_info.frame - .cast("ompt_frame_t", 0); + .cast("omp_frame_t", 0); sp_reentry->segment = OMPD_SEGMENT_UNSPECIFIED; ompd_rc_t ret = frame @@ -1385,38 +1260,6 @@ ompd_rc_t ompd_get_task_frame( return ret; } -ompd_rc_t -ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *task_data /* OUT: OpenMP task ID */ - ) { - if (!task_handle) - return ompd_rc_stale_handle; - if (!task_handle->ah) - return ompd_rc_stale_handle; - ompd_address_space_context_t *context = task_handle->ah->context; - if (!context) - return ompd_rc_stale_handle; - if (!ompd_state) - return ompd_rc_needs_state_tracking; - - assert(callbacks && "Callback table not initialized!"); - - TValue taskInfo; - if (task_handle->lwt.address != 0) - taskInfo = - TValue(context, task_handle->lwt).cast("ompt_lw_taskteam_t", 0); /*lwt*/ - else - taskInfo = TValue(context, task_handle->th).cast("kmp_taskdata_t", 0); /*t*/ - ompd_rc_t ret = taskInfo - .access("ompt_task_info") // td->ompt_task_info - .cast("ompt_task_info_t") - .access("task_data") // td->ompt_task_info.task_data - .getAddress(task_data); - - return ret; -} - -#if 1 // the runtime currently does not have task function information ompd_rc_t ompd_get_task_function( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_address_t *task_addr /* OUT: first instruction in the task region */ @@ -1433,35 +1276,66 @@ ompd_rc_t ompd_get_task_function( return ompd_rc_needs_state_tracking; assert(callbacks && "Callback table not initialized!"); - -#if 0 - /* We don't have a task function for implicit tasks */ - ompd_word_t implicit; - ompd_rc_t ret = ompd_is_implicit (task_handle, &implicit); - if (ret != ompd_rc_ok) - return ret; - if (implicit) - return ompd_rc_bad_input; 
-#else ompd_rc_t ret; -#endif - task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; - TValue taskInfo; - if(task_handle->lwt.address!=0) - return ompd_rc_bad_input; // We need to decide what we do here. - else - ret = TValue(context, task_handle->th). - cast("kmp_taskdata_t",0). /*t*/ - getArrayElement(1). /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ - cast("kmp_task_t",0). /* (kmp_task_t *) */ - access("routine"). /*td->ompt_task_info*/ - castBase(). - getValue(task_addr->address); + + if (task_handle->ah->kind == OMPD_DEVICE_KIND_CUDA) { + task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + ret = TValue(context, task_handle->th) + .cast("omptarget_nvptx_TaskDescr", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("ompd_thread_info") + .cast("ompd_nvptx_thread_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("task_function") + .castBase() + .getValue(task_addr->address); + + } else { + task_addr->segment = OMPD_SEGMENT_UNSPECIFIED; + TValue taskInfo; + if(task_handle->lwt.address!=0) + return ompd_rc_bad_input; // We need to decide what we do here. 
+ else + { + ompd_word_t val; + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_flags + .cast("kmp_tasking_flags_t") + .check("tasktype", &val); // td->td_flags.tasktype + + if (ret != ompd_rc_ok) + return ret; + + if (val==1) { // tasktype: explicit = 1, implicit = 0 + + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t",0) /*t*/ + .getArrayElement(1) /* see kmp.h: #define KMP_TASKDATA_TO_TASK(taskdata) (kmp_task_t *)(taskdata + 1) */ + .cast("kmp_task_t",0) /* (kmp_task_t *) */ + .access("routine") /*td->ompt_task_info*/ + .castBase() + .getValue(task_addr->address); + + } else { + + ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") /*td*/ + .access("td_team") /*td.td_team*/ + .cast("kmp_team_p", 1) + .access("t") /*td.td_team->t*/ + .cast("kmp_base_team_t", 0) + .access("t_pkfn") /*td.td_team->t.t_pkfn*/ + .castBase() + .getValue(task_addr->address); + + } + } + } return ret; } -#endif -/* --- 9 OMPD Version and Compatibility Information ------------------------- */ +/* --- --- OMPD Version and Compatibility Information ----------------------- */ ompd_rc_t ompd_get_api_version(ompd_word_t *version) { *version = OMPD_VERSION; @@ -1479,7 +1353,7 @@ ompd_get_api_version_string(const char **string /* OUT: OMPD version string */ return ompd_rc_ok; } -/* --- 12 Display Control Variables ----------------------------------------- */ +/* --- 4.8 Display Control Variables ---------------------------------------- */ ompd_rc_t ompd_get_display_control_vars(ompd_address_space_handle_t *handle, @@ -1501,12 +1375,12 @@ ompd_rc_t initTypeSizes(ompd_address_space_context_t *context) { static ompd_rc_t ret; if (inited) return ret; - ret = callbacks->tsizeof_prim(context, &type_sizes); + ret = callbacks->sizeof_types(context, &type_sizes); if (ret != ompd_rc_ok) return ret; if (!(type_sizes.sizeof_pointer > 0)) return ompd_rc_error; - ret = callbacks->tsizeof_prim(context, &TValue::type_sizes); + 
ret = callbacks->sizeof_types(context, &TValue::type_sizes); if (ret != ompd_rc_ok) return ret; inited = 1; diff --git a/libompd/src/omp-debug.h b/libompd/src/omp-debug.h index a4cd8f785..81b652dab 100644 --- a/libompd/src/omp-debug.h +++ b/libompd/src/omp-debug.h @@ -13,17 +13,16 @@ #ifdef __cplusplus #include -#include #define OMPD_DLL_VERSION 201811; extern "C" { #endif -#define OMPD_IMPLEMENTS_OPENMP 3 -#define OMPD_IMPLEMENTS_OPENMP_SUBVERSION 1 +#define OMPD_IMPLEMENTS_OPENMP 5 +#define OMPD_IMPLEMENTS_OPENMP_SUBVERSION 0 #define OMPD_TR_VERSION 6 -#define OMPD_TR_SUBVERSION 'j' +#define OMPD_TR_SUBVERSION 2 #define OMPD_VERSION \ (OMPD_IMPLEMENTS_OPENMP << 24) + (OMPD_IMPLEMENTS_OPENMP_SUBVERSION << 16) + \ (OMPD_TR_VERSION << 8) + OMPD_TR_SUBVERSION @@ -32,79 +31,78 @@ extern "C" { #define STR(x) STR_HELPER(x) #include "ompd.h" +#include "ompd-types.h" /****************************************************************************** * General helper functions - */ -ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); + */ + ompd_rc_t initTypeSizes(ompd_address_space_context_t *context); #ifdef __cplusplus -} - -static const ompd_callbacks_t *callbacks = NULL; - -class ompdAllocatable { -public: - static void *operator new(std::size_t sz) { - void *res; - ompd_rc_t ret = callbacks->dmemory_alloc(sz, &res); - if (ret == ompd_rc_ok) - return res; - throw std::bad_alloc(); - } - static void *operator new[](std::size_t sz) { - void *res; - ompd_rc_t ret = callbacks->dmemory_alloc(sz, &res); - if (ret == ompd_rc_ok) - return res; - throw std::bad_alloc(); - } - void operator delete(void *addr) throw() { - ompd_rc_t ret = callbacks->dmemory_free(addr); - if (ret != ompd_rc_ok) - throw std::bad_alloc(); - } - void operator delete[](void *addr) throw() { - ompd_rc_t ret = callbacks->dmemory_free(addr); - if (ret != ompd_rc_ok) - throw std::bad_alloc(); } -}; -typedef struct _ompd_address_space_context_s ompd_address_space_context_t; -typedef struct 
_ompd_process_handle_s : public ompdAllocatable { - ompd_address_space_context_t *context; -} ompd_process_handle_t; +static const ompd_callbacks_t *callbacks = nullptr; + + +// Information shared by all threads in a kernel +// Used to map thread handles to native cuda thread ids +typedef struct _ompd_cuda_thread_kernel_info_s { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_dim3_t gridDim; + ompd_dim3_t blockDim; +} ompd_cuda_thread_kernel_info_t; -typedef struct _ompd_address_space_handle_s : public ompdAllocatable { +typedef struct _ompd_address_space_context_s ompd_address_space_context_t; + +typedef struct _ompd_address_space_handle_s { ompd_address_space_context_t *context; - ompd_device_kind_t kind; - ompd_device_identifier_t id; + ompd_device_t kind; + uint64_t id; } ompd_address_space_handle_t; -typedef struct _ompd_device_handle_s : public ompdAllocatable { - ompd_address_space_handle_t *ah; - ompd_address_t th; /* target handle */ -} ompd_device_handle_t; - -typedef struct _ompd_thread_handle_s : public ompdAllocatable { +typedef struct _ompd_thread_handle_s { ompd_address_space_handle_t *ah; + ompd_thread_context_t *thread_context; ompd_address_t th; /* target handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* only valid for cuda */ } ompd_thread_handle_t; -typedef struct _ompd_parallel_handle_s : public ompdAllocatable { +typedef struct _ompd_parallel_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; /* copied from the thread + used to retrieve this + parallel region handle + */ } ompd_parallel_handle_t; -typedef struct _ompd_task_handle_s : public ompdAllocatable { +typedef struct _ompd_task_handle_s { ompd_address_space_handle_t *ah; ompd_address_t th; /* target handle */ ompd_address_t lwt; /* lwt handle */ + ompd_cuda_thread_kernel_info_t *cuda_kernel_info; 
/* copied from the thread + used to retrieve this + parallel region handle + */ + _ompd_task_handle_s(){ + ah=NULL; + th.segment=OMPD_SEGMENT_UNSPECIFIED; + lwt.segment=OMPD_SEGMENT_UNSPECIFIED; + th.address=0; + lwt.address=0; + cuda_kernel_info=NULL; + } } ompd_task_handle_t; #endif +// TODO (mr) this is ugly, but better then a global symbol (?) +void __ompd_init_icvs(const ompd_callbacks_t *table); +void __ompd_init_states(const ompd_callbacks_t *table); + #endif /* SRC_OMP_DEBUG_H_ */ diff --git a/libompd/src/omp-icv.cpp b/libompd/src/omp-icv.cpp new file mode 100644 index 000000000..72598ad09 --- /dev/null +++ b/libompd/src/omp-icv.cpp @@ -0,0 +1,512 @@ +#include "omp-debug.h" +#include "ompd-private.h" +#include "TargetValue.h" + +#define FOREACH_OMPD_ICV(macro) \ + macro (levels_var, "levels-var", ompd_scope_parallel, 1) \ + macro (active_levels_var, "active-levels-var", ompd_scope_parallel, 0) \ + macro (thread_limit_var, "thread-limit-var", ompd_scope_address_space, 0) \ + macro (max_active_levels_var, "max-active-levels-var", ompd_scope_task, 0) \ + macro (bind_var, "bind-var", ompd_scope_task, 0) \ + macro (num_procs_var, "ompd-num-procs-var", ompd_scope_address_space, 0) \ + macro (thread_num_var, "ompd-thread-num-var", ompd_scope_thread, 1) \ + macro (final_var, "ompd-final-var", ompd_scope_task, 0) \ + macro (implicit_var, "ompd-implicit-var", ompd_scope_task, 0) \ + macro (team_size_var, "ompd-team-size-var", ompd_scope_parallel, 1) \ + +void __ompd_init_icvs(const ompd_callbacks_t *table) { + callbacks = table; +} + +enum ompd_icv { + ompd_icv_undefined_marker = 0, // ompd_icv_undefined is already defined in ompd.h +#define ompd_icv_macro(v, n, s, d) ompd_icv_ ## v, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro + ompd_icv_after_last_icv +}; + +static const char *ompd_icv_string_values[] = { + "undefined", +#define ompd_icv_macro(v, n, s, d) n, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro +}; + +static const ompd_scope_t 
ompd_icv_scope_values[] = { + ompd_scope_global, // undefined marker +#define ompd_icv_macro(v, n, s, d) s, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro +}; + +static const uint8_t ompd_icv_available_cuda[] = { + 1, // undefined marker +#define ompd_icv_macro(v, n, s, d) d, + FOREACH_OMPD_ICV(ompd_icv_macro) +#undef ompd_icv_macro + 1, // icv after last icv marker +}; + + +static ompd_rc_t ompd_enumerate_icvs_cuda(ompd_icv_id_t current, + ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more) { + int next_possible_icv = current; + do { + next_possible_icv++; + } while (!ompd_icv_available_cuda[next_possible_icv]); + + if (next_possible_icv >= ompd_icv_after_last_icv) { + return ompd_rc_bad_input; + } + + *next_id = next_possible_icv; + *next_icv_name = ompd_icv_string_values[*next_id]; + *next_scope = ompd_icv_scope_values[*next_id]; + + do { + next_possible_icv++; + } while (!ompd_icv_available_cuda[next_possible_icv]); + + if (next_possible_icv >= ompd_icv_after_last_icv) { + *more = 0; + } else { + *more = 1; + } + return ompd_rc_ok; +} + +ompd_rc_t ompd_enumerate_icvs(ompd_address_space_handle_t *handle, + ompd_icv_id_t current, ompd_icv_id_t *next_id, + const char **next_icv_name, + ompd_scope_t *next_scope, + int *more) { + if (!handle) { + return ompd_rc_stale_handle; + } + if (handle->kind == OMPD_DEVICE_KIND_CUDA) { + return ompd_enumerate_icvs_cuda(current, next_id, next_icv_name, + next_scope, more); + } + if (current + 1 >= ompd_icv_after_last_icv) { + return ompd_rc_bad_input; + } + + *next_id = current + 1; + *next_icv_name = ompd_icv_string_values[*next_id]; + *next_scope = ompd_icv_scope_values[*next_id]; + + if ((*next_id) + 1 >= ompd_icv_after_last_icv) { + *more = 0; + } else { + *more = 1; + } + + return ompd_rc_ok; +} + + +static ompd_rc_t ompd_get_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: nesting level */ + ) { + if 
(!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + uint32_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_level") /*t.t_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} + + +static ompd_rc_t ompd_get_level_cuda( + ompd_parallel_handle_t *parallel_handle, + ompd_word_t *val) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized"); + + uint16_t res; + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("level") + .castBase(ompd_type_short) + .getValue(res); + *val = res; + return ret; +} + + +static ompd_rc_t ompd_get_active_level( + ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: active nesting level */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + uint32_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_active_level") /*t.t_active_level*/ + .castBase() + .getValue(res); + *val = res; + return ret; +} + + +static ompd_rc_t +ompd_get_num_procs(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: number of processes */ + ) { + ompd_address_space_context_t *context = addr_handle->context; + if (!context) + return ompd_rc_stale_handle; + ompd_rc_t ret; + + if (!context) + 
return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + int nth; + ret = TValue(context, "__kmp_avail_proc") + .castBase("__kmp_avail_proc") + .getValue(nth); + *val = nth; + return ret; +} + +static ompd_rc_t +ompd_get_thread_limit(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!addr_handle) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = addr_handle->context; + ompd_rc_t ret; + + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + int nth; + ret = + TValue(context, "__kmp_max_nth").castBase("__kmp_max_nth").getValue(nth); + *val = nth; + return ret; +} + +static ompd_rc_t ompd_get_thread_num( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *val /* OUT: number of the thread within the team */ + ) { + // __kmp_threads[8]->th.th_info.ds.ds_tid + if (!thread_handle) + return ompd_rc_stale_handle; + if (!thread_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = thread_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = + TValue(context, thread_handle->th) /*__kmp_threads[t]->th*/ + .cast("kmp_base_info_t") + .access("th_info") /*__kmp_threads[t]->th.th_info*/ + .cast("kmp_desc_t") + .access("ds") /*__kmp_threads[t]->th.th_info.ds*/ + .cast("kmp_desc_base_t") + .access("ds_tid") /*__kmp_threads[t]->th.th_info.ds.ds_tid*/ + .castBase() + .getValue(*val); + return ret; +} + +static ompd_rc_t +ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + 
assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_icvs + .cast("kmp_tasking_flags_t") + .check("final", val); // td->td_icvs.max_active_levels + + return ret; +} + +static ompd_rc_t +ompd_get_max_active_levels( + ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = + TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("max_active_levels") // td->td_icvs.max_active_levels + .castBase() + .getValue(*val); + + return ret; +} + +static ompd_rc_t +ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ + ompd_word_t *modifier /* OUT: Schedunling modifier */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + TValue sched = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("sched") // td->td_icvs.sched + .cast("kmp_r_sched_t", 0); + + ompd_rc_t ret = sched + .access("r_sched_type") // td->td_icvs.sched.r_sched_type + .castBase() + .getValue(*kind); + if (ret != ompd_rc_ok) + return ret; + ret = sched + .access("chunk") // td->td_icvs.sched.r_sched_type + .castBase() + .getValue(*modifier); + return ret; +} + +static ompd_rc_t +ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: 
OpenMP task handle*/ + ompd_word_t *bind /* OUT: Kind of proc-binding */ + ) { + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_icvs") // td->td_icvs + .cast("kmp_internal_control_t", 0) + .access("proc_bind") // td->td_icvs.proc_bind + .castBase() + .getValue(*bind); + + return ret; +} + + +static ompd_rc_t +ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ + ompd_word_t *val /* OUT: max number of threads */ + ) { + if (!task_handle) + return ompd_rc_stale_handle; + if (!task_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = task_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = TValue(context, task_handle->th) + .cast("kmp_taskdata_t") // td + .access("td_flags") // td->td_flags + .cast("kmp_tasking_flags_t") + .check("tasktype", val); // td->td_flags.tasktype + *val ^= 1; // tasktype: explicit = 1, implicit = 0 => invert the value + return ret; +} + +static ompd_rc_t +ompd_get_num_threads(ompd_parallel_handle_t + *parallel_handle, /* IN: OpenMP parallel handle */ + ompd_word_t *val /* OUT: number of threads */ + ) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized!"); + + ompd_rc_t ret = ompd_rc_ok; + if (parallel_handle->lwt.address != 0) { + *val = 1; + } else { + uint32_t res; + ret = TValue(context, parallel_handle->th) + .cast("kmp_base_team_t", 0) /*t*/ + .access("t_nproc") /*t.t_nproc*/ + .castBase() + .getValue(res); + *val = res; + } + 
return ret; +} + +static ompd_rc_t +ompd_get_num_threads_cuda(ompd_parallel_handle_t *parallel_handle, + ompd_word_t *val) { + if (!parallel_handle->ah) + return ompd_rc_stale_handle; + ompd_address_space_context_t *context = parallel_handle->ah->context; + if (!context) + return ompd_rc_stale_handle; + + assert(callbacks && "Callback table not initialized"); + + uint16_t res; + + ompd_rc_t ret = TValue(context, parallel_handle->th) + .cast("ompd_nvptx_parallel_info_t", 0, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("parallel_tasks") + .cast("omptarget_nvptx_TaskDescr", 1, + OMPD_SEGMENT_CUDA_PTX_GLOBAL) + .access("items__threadsInTeam") + .castBase() + .getValue(res); + *val = res; + return ret; +} + +ompd_rc_t ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + ompd_word_t *icv_value) { + if (!handle) { + return ompd_rc_stale_handle; + } + if (icv_id >= ompd_icv_after_last_icv || icv_id == 0) { + return ompd_rc_bad_input; + } + if (scope != ompd_icv_scope_values[icv_id]) { + return ompd_rc_bad_input; + } + + ompd_device_t device_kind; + + switch (scope) { + case ompd_scope_thread: + device_kind = ((ompd_thread_handle_t *)handle)->ah->kind; + break; + case ompd_scope_parallel: + device_kind = ((ompd_parallel_handle_t *)handle)->ah->kind; + break; + case ompd_scope_address_space: + device_kind = ((ompd_address_space_handle_t *)handle)->kind; + break; + case ompd_scope_task: + device_kind = ((ompd_task_handle_t *)handle)->ah->kind; + break; + default: + return ompd_rc_bad_input; + } + + + if (device_kind == OMPD_DEVICE_KIND_HOST) { + switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_active_levels_var: + return ompd_get_active_level((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_thread_limit_var: + return ompd_get_thread_limit((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_max_active_levels_var: + return 
ompd_get_max_active_levels((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_bind_var: + return ompd_get_proc_bind((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_num_procs_var: + return ompd_get_num_procs((ompd_address_space_handle_t*)handle, icv_value); + case ompd_icv_thread_num_var: + return ompd_get_thread_num((ompd_thread_handle_t*)handle, icv_value); + case ompd_icv_final_var: + return ompd_in_final((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_implicit_var: + return ompd_is_implicit((ompd_task_handle_t*)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads((ompd_parallel_handle_t*)handle, icv_value); + default: + return ompd_rc_unsupported; + } + } else if (device_kind == OMPD_DEVICE_KIND_CUDA) { + switch (icv_id) { + case ompd_icv_levels_var: + return ompd_get_level_cuda((ompd_parallel_handle_t *)handle, icv_value); + case ompd_icv_team_size_var: + return ompd_get_num_threads_cuda((ompd_parallel_handle_t*)handle, icv_value); + default: + return ompd_rc_unsupported; + } + } + return ompd_rc_unsupported; +} + +ompd_rc_t +ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, + const char **icv_string) { + return ompd_rc_unsupported; +} diff --git a/libompd/src/omp-state.cpp b/libompd/src/omp-state.cpp new file mode 100644 index 000000000..0e64aad95 --- /dev/null +++ b/libompd/src/omp-state.cpp @@ -0,0 +1,97 @@ +#include "ompd.h" +#include "ompd-private.h" +#include "omp-debug.h" +#include + +void __ompd_init_states(const ompd_callbacks_t *table) { + callbacks = table; +} + +static const char *get_ompd_state_name(ompd_word_t state) { + switch (state) { +#define ompd_state_macro(state, code) \ + case code: return #state ; + FOREACH_OMP_STATE(ompd_state_macro) +#undef ompd_state_macro + default: return NULL; + } +} + +static const char *get_ompd_cuda_state_name(ompd_word_t state) { + switch (state) { + case omp_state_work_serial: + return "omp_state_work_serial"; + case 
omp_state_work_parallel: + return "omp_state_work_parallel"; + case omp_state_work_reduction: + return "omp_state_work_reduction"; + default: + return NULL; + } +} + +ompd_rc_t ompd_enumerate_states( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, ompd_word_t *next_state, + const char **next_state_name, ompd_word_t *more_enums) { + ompd_rc_t ret; + if (address_space_handle->kind == OMPD_DEVICE_KIND_CUDA) { + // We only support a small number of states for cuda devices + *more_enums = 1; + switch (current_state) { + case omp_state_undefined: + *next_state = omp_state_work_serial; + break; + case omp_state_work_serial: + *next_state = omp_state_work_parallel; + break; + case omp_state_work_parallel: + *next_state = omp_state_work_reduction; + *more_enums = 0; + break; + default: + return ompd_rc_bad_input; + } + const char *find_next_state_name = get_ompd_cuda_state_name(*next_state); + char *next_state_name_cpy; + ret = callbacks->memory_alloc( + strlen(find_next_state_name) + 1, (void **)&next_state_name_cpy); + if (ret != ompd_rc_ok) { + return ret; + } + strcpy(next_state_name_cpy, get_ompd_cuda_state_name(*next_state)); + *next_state_name = next_state_name_cpy; + } else { + if (current_state > omp_state_undefined && + current_state >= OMPD_LAST_OMP_STATE) { + return ompd_rc_bad_input; + } + if (current_state == omp_state_undefined) { + (*next_state) = omp_state_work_serial; + (*next_state_name) = get_ompd_state_name(omp_state_work_serial); + (*more_enums) = 1; + return ompd_rc_ok; + } + const char *find_next_state_name; + *next_state = current_state + 1; + while (!(find_next_state_name = get_ompd_state_name(*next_state))) { + ++(*next_state); + } + + char *next_state_name_cpy; + ret = callbacks->memory_alloc(strlen(find_next_state_name) + 1, (void **)&next_state_name_cpy); + if (ret != ompd_rc_ok) { + return ret; + } + strcpy(next_state_name_cpy, find_next_state_name); + + *next_state_name = next_state_name_cpy; + + if 
(*next_state == OMPD_LAST_OMP_STATE) { + *more_enums = 0; + } else { + *more_enums = 1; + } + } + return ompd_rc_ok; +} diff --git a/libompd/src/ompd-private.h b/libompd/src/ompd-private.h new file mode 100644 index 000000000..bc5a04794 --- /dev/null +++ b/libompd/src/ompd-private.h @@ -0,0 +1,74 @@ +#ifndef SRC_OMPD_PRIVATE_H_ +#define SRC_OMPD_PRIVATE_H_ + + +/* + * Definition of OMPD states, taken from OMPT + */ +#define FOREACH_OMP_STATE(macro) \ + \ + /* first available state */ \ + macro (omp_state_undefined, 0x102) /* undefined thread state */ \ + \ + /* work states (0..15) */ \ + macro (omp_state_work_serial, 0x000) /* working outside parallel */ \ + macro (omp_state_work_parallel, 0x001) /* working within parallel */ \ + macro (omp_state_work_reduction, 0x002) /* performing a reduction */ \ + \ + /* barrier wait states (16..31) */ \ + macro (omp_state_wait_barrier, 0x010) /* waiting at a barrier */ \ + macro (omp_state_wait_barrier_implicit_parallel, 0x011) \ + /* implicit barrier at the end of parallel region */\ + macro (omp_state_wait_barrier_implicit_workshare, 0x012) \ + /* implicit barrier at the end of worksharing */ \ + macro (omp_state_wait_barrier_implicit, 0x013) /* implicit barrier */ \ + macro (omp_state_wait_barrier_explicit, 0x014) /* explicit barrier */ \ + \ + /* task wait states (32..63) */ \ + macro (omp_state_wait_taskwait, 0x020) /* waiting at a taskwait */ \ + macro (omp_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */ \ + \ + /* mutex wait states (64..127) */ \ + macro (omp_state_wait_mutex, 0x040) \ + macro (omp_state_wait_lock, 0x041) /* waiting for lock */ \ + macro (omp_state_wait_critical, 0x042) /* waiting for critical */ \ + macro (omp_state_wait_atomic, 0x043) /* waiting for atomic */ \ + macro (omp_state_wait_ordered, 0x044) /* waiting for ordered */ \ + \ + /* target wait states (128..255) */ \ + macro (omp_state_wait_target, 0x080) /* waiting for target region */ \ + macro (omp_state_wait_target_map, 0x081) /* 
waiting for target data mapping operation */ \ + macro (omp_state_wait_target_update, 0x082) /* waiting for target update operation */ \ + \ + /* misc (256..511) */ \ + macro (omp_state_idle, 0x100) /* waiting for work */ \ + macro (omp_state_overhead, 0x101) /* overhead excluding wait states */ \ + \ + /* implementation-specific states (512..) */ + +typedef enum omp_state_t { +#define ompd_state_macro(state, code) state = code, + FOREACH_OMP_STATE(ompd_state_macro) +#undef ompd_state_macro +} omp_state_t; + +#define OMPD_LAST_OMP_STATE omp_state_overhead + + +/** + * Primitive types. + */ +typedef enum ompd_target_prim_types_t { + ompd_type_invalid = -1, + ompd_type_char = 0, + ompd_type_short = 1, + ompd_type_int = 2, + ompd_type_long = 3, + ompd_type_long_long = 4, + ompd_type_pointer = 5, + ompd_type_max +} ompd_target_prim_types_t; + +#include "ompd-types.h" + +#endif /*SRC_OMPD_PRIVATE_H*/ diff --git a/libompd/src/ompd-types.h b/libompd/src/ompd-types.h new file mode 100644 index 000000000..ea5aedef4 --- /dev/null +++ b/libompd/src/ompd-types.h @@ -0,0 +1,65 @@ +/* +* @@name: ompd_types.h +*/ +#ifndef __OPMD_TYPES_H +#define __OPMD_TYPES_H +#include "ompd.h" + +#define OMPD_TYPES_VERSION 20170927 /* YYYYMMDD Format */ + +/* Kinds of device threads */ +#define OMPD_THREAD_ID_PTHREAD ((ompd_thread_id_t)0) +#define OMPD_THREAD_ID_LWP ((ompd_thread_id_t)1) +#define OMPD_THREAD_ID_WINTHREAD ((ompd_thread_id_t)2) +#define OMPD_THREAD_ID_CUDALOGICAL ((ompd_thread_id_t)3) +/* The range of non-standard implementation defined values */ +#define OMPD_THREAD_ID_LO ((ompd_thread_id_t)1000000) +#define OMPD_THREAD_ID_HI ((ompd_thread_id_t)1100000) + +/* Target Cuda device-specific thread identification */ +typedef struct ompd_dim3_t { + ompd_addr_t x; + ompd_addr_t y; + ompd_addr_t z; +} ompd_dim3_t; + +typedef struct ompd_cudathread_coord_t { + ompd_addr_t cudaDevId; + ompd_addr_t cudaContext; + ompd_addr_t warpSize; + ompd_addr_t gridId; + ompd_dim3_t gridDim; + 
ompd_dim3_t blockDim; + ompd_dim3_t blockIdx; + ompd_dim3_t threadIdx; +} ompd_cudathread_coord_t; + +/* Memory Access Segment definitions for Host and Target Devices */ +#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) + +/* Cuda-specific values consistent with those defined in cudadebugger.h */ +#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) +#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) +#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) +#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) +#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) +#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) +#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) +#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) +#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) +#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) +#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) +#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) +#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) +#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) +#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) +#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) + +/* Kinds of device device address spaces */ +#define OMPD_DEVICE_KIND_HOST ((ompd_device_t)1) +#define OMPD_DEVICE_KIND_CUDA ((ompd_device_t)2) +/* The range of non-standard implementation defined values */ +#define OMPD_DEVICE_IMPL_LO ((ompd_device_t)1000000) +#define OMPD_DEVICE_IMPL_HI ((ompd_device_t)1100000) +#endif + diff --git a/libompd/src/ompd.h b/libompd/src/ompd.h index 2c97f09f4..48ae79e98 100644 --- a/libompd/src/ompd.h +++ b/libompd/src/ompd.h @@ -21,10 +21,6 @@ * - Type entities end with the suffix "_t" (for type) * - Function types end with the suffix "_fn_t" (for function type) * - Return code entities have "_rc_" in it - * - Abstractions referring to the target have the prefix "t" (e.g., - * "tmemory" for memory in the target, or "tsymbol" for symbol in the target) - * - Abstractions referring to the 
debugger have the prefix "d" (e.g., - * "dmemory" for memory in the debugger) * * Comment conventions: * - Input function parameters denoted by "IN:" @@ -60,102 +56,11 @@ typedef struct ompd_address_t { ompd_addr_t address; /* target address in the segment */ } ompd_address_t; -#define OMPD_SEGMENT_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_TEXT ((ompd_seg_t)1) -#define OMPD_SEGMENT_DATA ((ompd_seg_t)2) +const uint64_t ompd_segment_none = 0; -/** - * The following definitions match with ptx information stored in DWARF - */ -#define OMPD_SEGMENT_CUDA_PTX_UNSPECIFIED ((ompd_seg_t)0) -#define OMPD_SEGMENT_CUDA_PTX_CODE ((ompd_seg_t)1) -#define OMPD_SEGMENT_CUDA_PTX_REG ((ompd_seg_t)2) -#define OMPD_SEGMENT_CUDA_PTX_SREG ((ompd_seg_t)3) -#define OMPD_SEGMENT_CUDA_PTX_CONST ((ompd_seg_t)4) -#define OMPD_SEGMENT_CUDA_PTX_GLOBAL ((ompd_seg_t)5) -#define OMPD_SEGMENT_CUDA_PTX_LOCAL ((ompd_seg_t)6) -#define OMPD_SEGMENT_CUDA_PTX_PARAM ((ompd_seg_t)7) -#define OMPD_SEGMENT_CUDA_PTX_SHARED ((ompd_seg_t)8) -#define OMPD_SEGMENT_CUDA_PTX_SURF ((ompd_seg_t)9) -#define OMPD_SEGMENT_CUDA_PTX_TEX ((ompd_seg_t)10) -#define OMPD_SEGMENT_CUDA_PTX_TEXSAMPLER ((ompd_seg_t)11) -#define OMPD_SEGMENT_CUDA_PTX_GENERIC ((ompd_seg_t)12) -#define OMPD_SEGMENT_CUDA_PTX_IPARAM ((ompd_seg_t)13) -#define OMPD_SEGMENT_CUDA_PTX_OPARAM ((ompd_seg_t)14) -#define OMPD_SEGMENT_CUDA_PTX_FRAME ((ompd_seg_t)15) -#define OMPD_SEGMENT_CUDA_PTX_MAX ((ompd_seg_t)16) - -#if 0 // types removed in Austin F2F -/* - * Definition of OMPD states, taken from OMPT - */ -#define FOREACH_OMPD_STATE(macro) \ - \ - /* first */ \ - macro(ompd_state_first, 0x71) /* initial enumeration state */ \ - \ - /* work states (0..15) */ \ - macro(ompd_state_work_serial, 0x00) /* working outside parallel */ \ - macro(ompd_state_work_parallel, 0x01) /* working within parallel */ \ - macro(ompd_state_work_reduction, 0x02) /* performing a reduction */ \ - \ - /* idle (16..31) */ \ - macro(ompd_state_idle, 0x10) /* waiting for work 
*/ \ - \ - /* overhead states (32..63) */ \ - macro(ompd_state_overhead, 0x20) /* overhead excluding wait states */ \ - \ - /* barrier wait states (64..79) */ \ - macro(ompd_state_wait_barrier, 0x40) /* waiting at a barrier */ \ - macro(ompd_state_wait_barrier_implicit, 0x41) /* implicit barrier */ \ - macro(ompd_state_wait_barrier_explicit, 0x42) /* explicit barrier */ \ - \ - /* task wait states (80..95) */ \ - macro(ompd_state_wait_taskwait, 0x50) /* waiting at a taskwait */ \ - macro(ompd_state_wait_taskgroup, 0x51) /* waiting at a taskgroup */ \ - \ - /* mutex wait states (96..111) */ \ - macro(ompd_state_wait_lock, 0x60) /* waiting for lock */ \ - macro(ompd_state_wait_nest_lock, 0x61) /* waiting for nest lock */ \ - macro(ompd_state_wait_critical, 0x62) /* waiting for critical */ \ - macro(ompd_state_wait_atomic, 0x63) /* waiting for atomic */ \ - macro(ompd_state_wait_ordered, 0x64) /* waiting for ordered */ \ - macro(ompd_state_wait_single, \ - 0x6F) /* waiting for single region (non-standard!) 
*/ \ - \ - /* misc (112..127) */ \ - macro(ompd_state_undefined, 0x70) /* undefined thread state */ - -typedef enum ompd_state_t { -#define ompd_state_macro(state, code) state = code, - FOREACH_OMPD_STATE(ompd_state_macro) -#undef ompd_state_macro -} ompd_state_t; - -typedef enum ompd_sched_t { - ompd_sched_static = 1, - ompd_sched_dynamic = 2, - ompd_sched_guided = 3, - ompd_sched_auto = 4, - ompd_sched_vendor_lo = 5, - ompd_sched_vendor_hi = 0x7fffffff -} ompd_sched_t; - -typedef enum ompd_proc_bind_t { - ompd_proc_bind_false = 0, - ompd_proc_bind_true = 1, - ompd_proc_bind_master = 2, - ompd_proc_bind_close = 3, - ompd_proc_bind_spread = 4 -} ompd_proc_bind_t; -#endif - -typedef uint64_t ompd_device_identifier_t; - -typedef enum ompd_device_kind_t { - ompd_device_kind_host = 1, - ompd_device_kind_cuda = 2 -} ompd_device_kind_t; +/* types for device and thread id KIND, not for the actual thread/device id */ +typedef uint64_t ompd_device_t; +typedef uint64_t ompd_thread_id_t; /** * Context handle. @@ -185,42 +90,19 @@ typedef struct _ompd_task_handle_s ompd_task_handle_t; typedef struct _ompd_address_space_handle_s ompd_address_space_handle_t; /** - * Other handles. 
+ * Scope for ICVs */ -#define OMPD_THREAD_ID_PTHREAD 0 -#define OMPD_THREAD_ID_LWP 1 -#define OMPD_THREAD_ID_WINTHREAD 2 -#define OMPD_THREAD_ID_CUDALOGICAL 3 -#define OMPD_THREAD_ID_MAX 4 - -typedef enum ompd_thread_id_kind_t { - ompd_thread_id_pthread = 0, - ompd_thread_id_lwp = 1, - ompd_thread_id_winthread = 2, - ompd_thread_id_cudalogical = 3 -} ompd_thread_id_kind_t; +typedef enum ompd_scope_t { + ompd_scope_global = 1, + ompd_scope_address_space = 2, + ompd_scope_thread = 3, + ompd_scope_parallel = 4, + ompd_scope_implicit_task = 5, + ompd_scope_task = 6 +} ompd_scope_t; -/** - * Logical coordinates of OMP target device threads - */ -typedef struct ompd_dim3_t { - ompd_word_t x; - ompd_word_t y; - ompd_word_t z; -} ompd_dim3_t; - -typedef struct ompd_cudathread_coord_t { - ompd_addr_t cudaDevId; - ompd_addr_t cudaContext; - ompd_addr_t warpSize; - ompd_addr_t gridId; - ompd_addr_t kernelId; // TODO (MJM) - for some reason, cuda-gdb doesn't work - // with grids too well. - ompd_dim3_t gridDim; - ompd_dim3_t blockDim; - ompd_dim3_t blockIdx; - ompd_dim3_t threadIdx; -} ompd_cudathread_coord_t; +typedef uint64_t ompd_icv_id_t; +const uint64_t ompd_icv_undefined = 0; /** * Return codes. @@ -240,33 +122,19 @@ typedef enum ompd_rc_t { ompd_rc_nomem = 10 /* unable to allocate memory */ } ompd_rc_t; -/** - * Primitive types. - */ -typedef enum ompd_target_prim_types_t { - ompd_type_invalid = -1, - ompd_type_char = 0, - ompd_type_short = 1, - ompd_type_int = 2, - ompd_type_long = 3, - ompd_type_long_long = 4, - ompd_type_pointer = 5, - ompd_type_max -} ompd_target_prim_types_t; - /** * Primitive type sizes. * These types are used by OMPD to interrogate the debugger about the size of * primitive types in the target. 
*/ -typedef struct ompd_target_type_sizes_t { +typedef struct ompd_device_type_sizes_t { uint8_t sizeof_char; uint8_t sizeof_short; uint8_t sizeof_int; uint8_t sizeof_long; uint8_t sizeof_long_long; uint8_t sizeof_pointer; -} ompd_target_type_sizes_t; +} ompd_device_type_sizes_t; /****************************************************************************** * Debugger callback signatures. @@ -280,7 +148,7 @@ typedef struct ompd_target_type_sizes_t { /** * Allocate memory in the debugger's address space. */ -typedef ompd_rc_t (*ompd_dmemory_alloc_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_alloc_fn_t)( ompd_size_t bytes, /* IN: bytes of the primitive type */ void **ptr /* OUT: pointer of the allocated memory */ ); @@ -288,43 +156,31 @@ typedef ompd_rc_t (*ompd_dmemory_alloc_fn_t)( /** * Free memory in the debugger's address space. */ -typedef ompd_rc_t (*ompd_dmemory_free_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_free_fn_t)( void *ptr /* IN: pointer of memory to deallocate */ ); /** * Get thread specific context. 
*/ -typedef ompd_rc_t (*ompd_get_thread_context_for_thread_id_fn_t)( - ompd_address_space_context_t *context, ompd_thread_id_kind_t kind, +typedef ompd_rc_t (*ompd_callback_get_thread_context_for_thread_id_fn_t)( + ompd_address_space_context_t *context, ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, const void *thread_id, ompd_thread_context_t **thread_context); -#if 0 -/** - * Get containing (host) process context for address_space_context - */ -typedef ompd_rc_t (*ompd_get_process_context_for_context_fn_t) ( - ompd_address_space_context_t* - address_space_context, /* IN: OMP device/process addr space */ - ompd_address_space_context_t** - containing_address_space_context /* OUT: Containing omp process addr space */ -); -#endif - /** * Look up the sizes of primitive types in the target */ -typedef ompd_rc_t (*ompd_tsizeof_prim_fn_t)( +typedef ompd_rc_t (*ompd_callback_sizeof_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ - ompd_target_type_sizes_t *sizes /* OUT: type sizes */ + ompd_device_type_sizes_t *sizes /* OUT: type sizes */ ); /** * Look up the address of a global symbol in the target */ -typedef ompd_rc_t (*ompd_tsymbol_addr_fn_t)( +typedef ompd_rc_t (*ompd_callback_symbol_addr_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -336,7 +192,7 @@ typedef ompd_rc_t (*ompd_tsymbol_addr_fn_t)( /** * Read memory from the target */ -typedef ompd_rc_t (*ompd_tmemory_read_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_read_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -349,7 +205,7 @@ typedef ompd_rc_t (*ompd_tmemory_read_fn_t)( /** * Write memory from the target */ -typedef ompd_rc_t (*ompd_tmemory_write_fn_t)( +typedef ompd_rc_t (*ompd_callback_memory_write_fn_t)( ompd_address_space_context_t *context, /* IN: debugger handle for the target */ ompd_thread_context_t @@ -359,7 +215,7 @@ typedef 
ompd_rc_t (*ompd_tmemory_write_fn_t)( const void *buffer /* IN: output buffer */ ); -typedef ompd_rc_t (*ompd_target_host_fn_t)( +typedef ompd_rc_t (*ompd_callback_device_host_fn_t)( ompd_address_space_context_t *address_space_context, /* IN */ const void *input, /* IN */ int unit_size, /* IN */ @@ -372,7 +228,7 @@ typedef ompd_rc_t (*ompd_target_host_fn_t)( * This is used by the OMPD library to have the debugger print a string. * The OMPD should not print directly. */ -typedef ompd_rc_t (*ompd_print_string_fn_t)( +typedef ompd_rc_t (*ompd_callback_print_string_fn_t)( const char *str /* IN: message to print */ ); @@ -381,34 +237,32 @@ typedef ompd_rc_t (*ompd_print_string_fn_t)( */ typedef struct ompd_callbacks_t { /* Debugger interface */ - ompd_dmemory_alloc_fn_t dmemory_alloc; - ompd_dmemory_free_fn_t dmemory_free; - ompd_print_string_fn_t print_string; + ompd_callback_memory_alloc_fn_t memory_alloc; + ompd_callback_memory_free_fn_t memory_free; + ompd_callback_print_string_fn_t print_string; /* Target interface */ - ompd_tsizeof_prim_fn_t tsizeof_prim; - ompd_tsymbol_addr_fn_t tsymbol_addr; - ompd_tmemory_read_fn_t read_tmemory; - ompd_tmemory_write_fn_t write_tmemory; - - ompd_target_host_fn_t target_to_host; - ompd_target_host_fn_t host_to_target; + ompd_callback_sizeof_fn_t sizeof_types; + ompd_callback_symbol_addr_fn_t symbol_addr_lookup; + ompd_callback_memory_read_fn_t read_memory; + ompd_callback_memory_write_fn_t write_memory; - ompd_get_thread_context_for_thread_id_fn_t get_thread_context_for_thread_id; - // ompd_get_process_context_for_context_fn_t get_containing_process_context; + ompd_callback_device_host_fn_t device_to_host; + ompd_callback_device_host_fn_t host_to_device; + ompd_callback_get_thread_context_for_thread_id_fn_t get_thread_context_for_thread_id; } ompd_callbacks_t; /****************************************************************************** * Call signatures from the debugger to the OMPD DLL. 
*/ -/* --- 4 Initialization ----------------------------------------------------- */ +/* --- 4.1 Initialization --------------------------------------------------- */ /** * The OMPD function ompd_get_version_string returns a descriptive string * describing an implementation of the OMPD library. The function - * ompd_get_version_compatibility returns an integer code used to indicate the + * ompd_get_api_version returns an integer code used to indicate the * revision of the OMPD specification supported by an implementation of OMPD. */ @@ -425,8 +279,13 @@ ompd_get_api_version_string(const char **string /* OUT: OMPD version string */ * maintain the functions valid for as long as needed. */ ompd_rc_t -ompd_initialize(const ompd_callbacks_t *table, /* IN: callbacks table */ - ompd_word_t version); +ompd_initialize(ompd_word_t version, + const ompd_callbacks_t *table /* IN: callbacks table */ + ); + +ompd_rc_t ompd_finalize(void); + +/* --- 4.2 Per Process Initialization and Finalization ---------------------- */ ompd_rc_t ompd_process_initialize(ompd_address_space_context_t @@ -435,53 +294,34 @@ ompd_process_initialize(ompd_address_space_context_t *addrhandle /* OUT: ompd handle for the target */ ); -ompd_rc_t -ompd_get_openmp_version(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *version); - -ompd_rc_t ompd_get_openmp_version_string( - ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - const char **string); +ompd_rc_t ompd_device_initialize( + ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ + ompd_address_space_context_t *device_context, + int kind, + ompd_size_t sizeof_id, + void *id, + ompd_address_space_handle_t **device_handle + ); ompd_rc_t ompd_release_address_space_handle( ompd_address_space_handle_t *addr_handle /* IN: handle for the address space */ ); -ompd_rc_t ompd_device_initialize( - ompd_address_space_context_t - *context, /* IN: debugger 
handle for the device */ - ompd_device_identifier_t id, /* IN: object defined by native device API */ - ompd_device_kind_t kind, /* IN: */ - ompd_address_space_handle_t * - *addrhandle /* OUT: ompd handle for the device */ - ); +/* --- 4.4 Address Space Information ---------------------------------------- */ -ompd_rc_t ompd_finalize(void); -/* --- 4 Handle Management -------------------------------------------------- */ +ompd_rc_t +ompd_get_omp_version(ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_word_t *version); -/* --- 4.1 Thread Handles --------------------------------------------------- */ +ompd_rc_t ompd_get_omp_version_string( + ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + const char **string); + +/* --- 4.5 Thread Handles --------------------------------------------------- */ -/** - * Retrieve handles for all OpenMP threads. - * - * The ompd_get_threads operation enables the debugger to obtain handles for all - * OpenMP threads. A successful invocation of ompd_get_threads returns a pointer - * to a vector of handles in thread_handle_array and returns the number of - * handles in num_handles. This call yields meaningful results only if all - * OpenMP threads are stopped; otherwise, the OpenMP runtime may be creating - * and/or destroying threads during or after the call, rendering useless the - * vector of handles returned. - */ -#if 0 -ompd_rc_t ompd_get_threads ( - ompd_address_space_handle_t *addr_handle, /* IN: handle for the address space */ - ompd_thread_handle_t ***thread_handle_array, /* OUT: array of handles */ - int *num_handles /* OUT: number of handles in the array */ - ); -#endif /** * Retrieve handles for OpenMP threads in a parallel region. 
* @@ -500,11 +340,21 @@ ompd_rc_t ompd_get_thread_in_parallel( ompd_thread_handle_t **thread_handle /* OUT: handle */ ); -#if 0 -ompd_rc_t ompd_get_master_thread_in_parallel ( - ompd_parallel_handle_t *parallel_handle, /* IN */ - ompd_thread_handle_t **thread_handle); -#endif +/** + * Obtain an OpenMP thread handle and the internal OS thread handle for the + * selected (context) thread. + * If the function returns ompd_rc_ok then the operating system thread + * corresponds to an OpenMP thread and the thread_handle is initialized. The + * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime + * system. + */ +ompd_rc_t ompd_get_thread_handle( + ompd_address_space_handle_t + *addr_handle, /* IN: handle for the address space */ + ompd_thread_id_t kind, + ompd_size_t sizeof_thread_id, const void *thread_id, + ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ + ); ompd_rc_t ompd_release_thread_handle(ompd_thread_handle_t *thread_handle); @@ -512,14 +362,16 @@ ompd_rc_t ompd_thread_handle_compare(ompd_thread_handle_t *thread_handle_1, ompd_thread_handle_t *thread_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_thread_handle_string_id ( - ompd_thread_handle_t *thread_handle, - char **string_id -); -#endif +/** + * Obtain the OS thread handle for an OpenMP thread handle. + * this might change over time in case virtual openmp threads migrate between + * OS threads. + */ +ompd_rc_t ompd_get_thread_id( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_thread_id_t kind, ompd_size_t sizeof_thread_id, void *thread_id); -/* --- 4.2 Parallel Region Handles------------------------------------------- */ +/* --- 4.6 Parallel Region Handles------------------------------------------- */ /** * Retrieve the handle for the innermost patallel region for an OpenMP thread. 
@@ -572,14 +424,7 @@ ompd_parallel_handle_compare(ompd_parallel_handle_t *parallel_handle_1, ompd_parallel_handle_t *parallel_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_parallel_handle_string_id ( - ompd_parallel_handle_t *parallel_handle, - char **string_id -); -#endif - -/* --- 4.3 Task Handles ----------------------------------------------------- */ +/* --- 4.7 Task Handles ----------------------------------------------------- */ /** * Retrieve the handle for the innermost task for an OpenMP thread. @@ -589,7 +434,7 @@ ompd_rc_t ompd_get_parallel_handle_string_id ( * for the innermost task region associated with an OpenMP thread. This call is * meaningful only if the thread whose handle is provided is stopped. */ -ompd_rc_t ompd_get_current_task__handle( +ompd_rc_t ompd_get_current_task_handle( ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ ompd_task_handle_t **task_handle /* OUT: OpenMP task handle */ ); @@ -602,19 +447,13 @@ ompd_rc_t ompd_get_current_task__handle( * meaningful only if the thread executing the task specified by task_handle is * stopped. 
*/ -#if 0 -ompd_rc_t ompd_get_ancestor_task_handle( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ - ); -#endif -ompd_rc_t ompd_get_generating_ancestor_task_handle( +ompd_rc_t ompd_get_generating_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ); -ompd_rc_t ompd_get_scheduling_ancestor_task_handle( +ompd_rc_t ompd_get_scheduling_task_handle( ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ ompd_task_handle_t **parent_task_handle /* OUT: OpenMP task handle */ ); @@ -639,212 +478,11 @@ ompd_rc_t ompd_task_handle_compare(ompd_task_handle_t *task_handle_1, ompd_task_handle_t *task_handle_2, int *cmp_value); -#if 0 -ompd_rc_t ompd_get_task_handle_string_id ( - ompd_task_handle_t *task_handle, - char **string_id -); -#endif - -/* --- 5o Process and Thread Settings ---------------------------------------- - */ - -/** - * The functions ompd_get_num_procs and ompd_get_thread_limit are third-party - * versions of the OpenMP runtime functions omp_get_num_procs and - * omp_get_thread_limit. - */ - -ompd_rc_t -ompd_get_num_procs(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: number of processes */ - ); - -ompd_rc_t -ompd_get_thread_limit(ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_word_t *val /* OUT: max number of threads */ - ); - -/* --- 6 Parallel Region Inqueries ------------------------------------------ */ -/* --- 6.1 Settings --------------------------------------------------------- */ - -/** - * Determine the number of threads associated with a parallel region. 
- */ -ompd_rc_t ompd_get_num_threads( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: number of threads */ - ); - -/** - * Determine the nesting depth of a particular parallel region instance. - */ -ompd_rc_t ompd_get_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: nesting level */ - ); - -/** - * Determine the number of enclosing active parallel regions. - * - * ompd_get_active_level returns the number of nested, active parallel regions - * enclosing the parallel region specified by its handle. - */ -ompd_rc_t ompd_get_active_level( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_word_t *val /* OUT: active nesting level */ - ); - -/* --- 6.2 OMPT Parallel Region Inquiry Analogues ------------------------- */ - -/** - * The functions ompd_get_parallel_id and ompd_get_parallel_function are - * third-party variants of their OMPT counterparts. The only difference between - * the OMPD and OMPT versions is that the OMPD must supply a parallel region - * handle to provide a context for these inquiries. - */ -ompd_rc_t ompd_get_parallel_data( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *data /* OUT: OpenMP parallel id */ - ); - -#if 0 -ompd_rc_t ompd_get_parallel_function( - ompd_parallel_handle_t *parallel_handle, /* IN: OpenMP parallel handle */ - ompd_address_t *parallel_addr /* OUT: first instruction in the parallel region */ - ); -#endif - -/* --- 7 Thread Inquiry ----------------------------------------------------- */ -/* --- 7.1 Operating System Thread Inquiry ---------------------------------- */ - -/** - * Obtain an OpenMP thread handle and the internal OS thread handle for the - * selected (context) thread. - * If the function returns ompd_rc_ok then the operating system thread - * corresponds to an OpenMP thread and the thread_handle is initialized. 
The - * value of thread_handle ans os_thread is meaningful only to the OpenMP runtime - * system. - */ -ompd_rc_t ompd_get_thread_handle( - ompd_address_space_handle_t - *addr_handle, /* IN: handle for the address space */ - ompd_thread_id_kind_t kind, - ompd_size_t sizeof_thread_id, const void *thread_id, - ompd_thread_handle_t **thread_handle /* OUT: OpenMP thread handle*/ - ); - -/** - * Obtain the OS thread handle for an OpenMP thread handle. - * this might change over time in case virtual openmp threads migrate between - * OS threads. - */ -ompd_rc_t ompd_get_thread_id( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_thread_id_kind_t kind, ompd_size_t sizeof_thread_id, void *thread_id); - -ompd_rc_t ompd_get_thread_data( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_address_t *data /* OUT: OpenMP thread data */ - ); - -ompd_rc_t ompd_get_thread_num( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *val /* OUT: number of the thread within the team */ - ); - -/* --- 7.2 OMPT Thread State Inquiry Analogue ------------------------------- */ - -/** - * Get the state of a thread. This can use OMPT state data structure to define - * different states of threads (e.g., idle, working, or barrier, etc) and what - * entity cased this state (e.g., address of a lock); - * - * The function ompd_get_state is a third-party version of ompt_get_state. The - * only difference between the OMPD and OMPT counterparts is that the OMPD - * version must supply a thread handle to provide a context for this inquiry. 
- */ -ompd_rc_t ompd_get_state( - ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ - ompd_word_t *state, /* OUT: State of this thread */ - ompd_wait_id_t *wait_id /* OUT: Wait ID */ - ); - -/* --- 8 Task Inquiry ------------------------------------------------------- */ - -/* --- 8.1 Task Function Entry Point ---------------------------------------- */ - -/** - * The ompd_get_task_function returns the entry point of the code that - * corresponds to the body of code executed by the task. - */ -#if 0 ompd_rc_t ompd_get_task_function( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_address_t *entry_point /* OUT: first instruction in the task region */ - ); -#endif - -/* --- 8.2 Task Settings ---------------------------------------------------- */ - -/** - * Retrieve information from OpenMP tasks. These inquiry functions have no - * counterparts in the OMPT interface as a first-party tool can call OpenMP - * runtime inquiry functions directly. The only difference between the OMPD - * inquiry operations and their counterparts in the OpenMP runtime is that the - * OMPD version must supply a task handle to provide a context for each inquiry. - */ - -ompd_rc_t ompd_get_max_threads( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: max number of threads */ - ); - -ompd_rc_t -ompd_in_parallel(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in parallel? */ - ); - -ompd_rc_t -ompd_in_final(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: Is OpenMP in final? */ - ); - -ompd_rc_t -ompd_get_dynamic(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: ? */ - ); - -ompd_rc_t -ompd_get_nested(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: Is this task nested? 
*/ - ); - -ompd_rc_t ompd_get_max_active_levels( - ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_word_t *val /* OUT: max active levels */ - ); - -ompd_rc_t -ompd_get_schedule(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *kind, /* OUT: Kind of OpenMP schedule*/ - ompd_word_t *modifier /* OUT: Schedunling modifier */ - ); - -ompd_rc_t -ompd_get_proc_bind(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *bind /* OUT: Kind of proc-binding */ - ); - -ompd_rc_t -ompd_is_implicit(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle*/ - ompd_word_t *val /* OUT: implicit=1, explicit=0 */ - ); + ompd_task_handle_t *task_handle, + ompd_address_t *entry_point); -/* --- 8.3 OMPT Task Inquiry Analogues -------------------------------------- */ /** * The functions defined here are third-party versions of ompt_get_task_frame @@ -876,12 +514,31 @@ ompd_rc_t ompd_get_task_frame( ompd_address_t *sp_reentry /* OUT: previous frame is user code */ ); -ompd_rc_t -ompd_get_task_data(ompd_task_handle_t *task_handle, /* IN: OpenMP task handle */ - ompd_address_t *task_data /* OUT: OpenMP task ID */ - ); -/* --- 13 Display Control Variables ----------------------------------------- */ +/** + * Get the state of a thread. This can use OMPT state data structure to define + * different states of threads (e.g., idle, working, or barrier, etc) and what + * entity cased this state (e.g., address of a lock); + * + * The function ompd_get_state is a third-party version of ompt_get_state. The + * only difference between the OMPD and OMPT counterparts is that the OMPD + * version must supply a thread handle to provide a context for this inquiry. 
+ */ +ompd_rc_t ompd_enumerate_states ( + ompd_address_space_handle_t *address_space_handle, + ompd_word_t current_state, + ompd_word_t *next_state, + const char **next_state_name, + ompd_word_t *more_enums + ); + +ompd_rc_t ompd_get_state( + ompd_thread_handle_t *thread_handle, /* IN: OpenMP thread handle*/ + ompd_word_t *state, /* OUT: State of this thread */ + ompd_wait_id_t *wait_id /* OUT: Wait ID */ + ); + +/* --- 4.8 Display Control Variables ---------------------------------------- */ /** * Using the ompd_display_control_vars function, the debugger can extract a @@ -905,6 +562,21 @@ ompd_rc_t ompd_release_display_control_vars( const char *const **control_var_values /* IN */ ); +/* --- 4.9 Internal Control Variables --------------------------------------- */ + +ompd_rc_t +ompd_enumerate_icvs(ompd_address_space_handle_t *handle, ompd_icv_id_t current, + ompd_icv_id_t *next_id, const char **next_icv_name, + ompd_scope_t *next_scope, int *more); + +ompd_rc_t +ompd_get_icv_from_scope(void *handle, ompd_scope_t scope, ompd_icv_id_t icv_id, + ompd_word_t *icv_value); + +ompd_rc_t +ompd_get_icv_string_from_scope(void *handle, ompd_scope_t scope, + ompd_icv_id_t icv_id, const char **icv_string); + #ifdef __cplusplus } #endif diff --git a/libompd/src/ompd_test.c b/libompd/src/ompd_test.c index 92609a66a..f54385730 100644 --- a/libompd/src/ompd_test.c +++ b/libompd/src/ompd_test.c @@ -59,7 +59,7 @@ void test_CB_tsizeof_prim() { test_print_header(); ompd_rc_t ret; - ompd_target_type_sizes_t sizes; + ompd_device_type_sizes_t sizes; ret = callbacks->tsizeof_prim((ompd_context_t *)1, &sizes); if (ret == ompd_rc_ok) { printf("%-20s %du\n", "Size of char:", sizes.sizeof_char); diff --git a/libomptarget/CMakeLists.txt b/libomptarget/CMakeLists.txt index 3d9c78a72..8b721c0e7 100644 --- a/libomptarget/CMakeLists.txt +++ b/libomptarget/CMakeLists.txt @@ -53,6 +53,12 @@ if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) add_definitions(-O0) endif() +# OMPD support for libomptarget 
(currently only with cuda) +set(LIBOMPTARGET_OMPD_SUPPORT FALSE CACHE BOOL "OMPD-support?") +if (LIBOMPTARGET_OMPD_SUPPORT) + add_definitions(-DOMPD_SUPPORT=1) +endif() + include_directories(include) # Build target agnostic offloading library. diff --git a/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake new file mode 100644 index 000000000..5c6934011 --- /dev/null +++ b/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -0,0 +1,112 @@ +# +#//===----------------------------------------------------------------------===// +#// +#// The LLVM Compiler Infrastructure +#// +#// This file is dual licensed under the MIT and the University of Illinois Open +#// Source Licenses. See LICENSE.txt for details. +#// +#//===----------------------------------------------------------------------===// +# + +# We use the compiler and linker provided by the user, attempt to use the one +# used to build libomptarget or just fail. +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) +else() + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS "${llvm_link}") + # Use llvm-link from the compiler directory. 
+ set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") +else() + return() +endif() + +function(try_compile_bitcode output source) + set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) + file(WRITE ${srcfile} "${source}\n") + set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) + + # The remaining arguments are the flags to be tested. + # FIXME: Don't hardcode GPU version. This is currently required because + # Clang refuses to compile its default of sm_20 with CUDA 9. + execute_process( + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} + --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} + RESULT_VARIABLE result + OUTPUT_QUIET ERROR_QUIET) + if (result EQUAL 0) + set(${output} TRUE PARENT_SCOPE) + else() + set(${output} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Save for which compiler we are going to do the following checks so that we +# can discard cached values if the user specifies a different value. +set(discard_cached FALSE) +if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND + NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) + set(discard_cached TRUE) +endif() +set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) + +function(check_bitcode_compilation output source) + if (${discard_cached} OR NOT DEFINED ${output}) + message(STATUS "Performing Test ${output}") + # Forward additional arguments which contain the flags. + try_compile_bitcode(result "${source}" ${ARGN}) + set(${output} ${result} CACHE INTERNAL "" FORCE) + if(${result}) + message(STATUS "Performing Test ${output} - Success") + else() + message(STATUS "Performing Test ${output} - Failed") + endif() + endif() +endfunction() + +# These flags are required to emit LLVM Bitcode. We check them together because +# if any of them are not supported, there is no point in finding out which are. 
+set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) + +# It makes no sense to continue given that the compiler doesn't support +# emitting basic LLVM Bitcode +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + return() +endif() + +set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) + +# Declaring external shared device variables might need an additional flag +# since Clang 7.0 and was entirely unsupported since version 4.0. +set(extern_device_shared_src "extern __device__ __shared__ int test;") + +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) + set(compiler_flag_fcuda_rdc -fcuda-rdc) + set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) + check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + return() + endif() + + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") +endif() + +# We can compile LLVM Bitcode from CUDA source code! 
+set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) diff --git a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index d9a76c2c6..b0fdc5b4d 100644 --- a/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -54,6 +54,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) src/reduction.cu src/sync.cu src/task.cu + src/ompd-specific.cu ) set(omp_data_objects src/omp_data.cu) @@ -89,126 +90,95 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND) OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) # Install device RTL under the lib destination folder. - install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "lib") + install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + # Check if we can create an LLVM bitcode implementation of the runtime library - # that could be inlined in the user implementation. - set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL + # that could be inlined in the user application. For that we need to find + # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and + # an LLVM linker. 
+ set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + include(LibomptargetNVPTXBitcodeLibrary) + + set(bclib_default FALSE) + if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + set(bclib_default TRUE) + endif() + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL "Enable CUDA LLVM bitcode offloading device RTL.") if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") + endif() + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - # Find a clang compiler capable of compiling cuda files to LLVM bitcode and - # an LLVM linker. - # We use the one provided by the user, attempt to use the one used to build - # libomptarget or just fail. - - set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING - "Location of a CUDA compiler capable of emitting LLVM bitcode.") - set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING - "Location of a linker capable of linking LLVM bitcode objects.") - - if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) - elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) + # Set flags for LLVM Bitcode compilation. 
+ set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} -DOMPTARGET_NVPTX_TEST=0) + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) else() - libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.") - libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER") + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) endif() - # Get compiler directory to try to locate a suitable linker - get_filename_component(COMPILER_DIR ${CMAKE_C_COMPILER} DIRECTORY) - - if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) - elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND EXISTS "${COMPILER_DIR}/llvm-link") - # Use llvm-link from the directory containing clang - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link) - else() - libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.") - libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER") + if(${LIBOMPTARGET_OMPD_SUPPORT}) + set(bc_flags ${bc_flags} -DOMPD_SUPPORT=1) endif() - if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER) - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - - # Decide which ptx version to use. Same choices as Clang. - if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9) - set(CUDA_PTX_VERSION ptx60) - else() - set(CUDA_PTX_VERSION ptx42) - endif() - - set(BC_DEBUG -DOMPTARGET_NVPTX_DEBUG=0) - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(BC_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) - endif() - - # Set flags for Clang cuda compilation. Only Clang is supported because there is - # no other compiler capable of generating bitcode from cuda sources. 
- set(CUDA_FLAGS - -emit-llvm - -O1 - -Xclang -target-feature - -Xclang +${CUDA_PTX_VERSION} - --cuda-device-only - -DOMPTARGET_NVPTX_TEST=0 - ${BC_DEBUG} - ) + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR GREATER 8) + set(bc_flags ${bc_flags} -Dnv_weak=weak) + endif() - # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared - # to handle. Therefore, we use 'weak' instead. We are compiling only for the - # device, so it should be equivalent. - if(CUDA_VERSION_MAJOR EQUAL 9) - set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak) - endif() - - # Get the compute capability the user requested or use SM_35 by default. - set(CUDA_ARCH "") - foreach(sm ${nvptx_sm_list}) - set(CUDA_ARCH --cuda-gpu-arch=sm_${sm}) - - # Compile cuda files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) - - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() - - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + # Generate a Bitcode library for all the compute capabilities the user requested. 
+  foreach(sm ${nvptx_sm_list}) +    set(cuda_arch --cuda-gpu-arch=sm_${sm}) + +    # Compile CUDA files to bitcode. +    set(bc_files "") +    foreach(src ${cuda_src_files}) +      get_filename_component(infile ${src} ABSOLUTE) +      get_filename_component(outfile ${src} NAME) + +      add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc +        COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} +          -c ${infile} -o ${outfile}-sm_${sm}.bc +        DEPENDS ${infile} +        IMPLICIT_DEPENDS CXX ${infile} +        COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" +        VERBATIM ) -      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) +      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) -      add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) +      list(APPEND bc_files ${outfile}-sm_${sm}.bc) +    endforeach() -      # Copy library to destination. -      add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD -                         COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc -                         $<TARGET_FILE_DIR:omptarget-nvptx>) +    # Link to a bitcode library. +    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc +      COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} +        -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} +      DEPENDS ${bc_files} +      COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" +    ) +    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) -      # Install device RTL under the lib destination folder. -      install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "lib") -    endforeach() -  endif() +    add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + +    # Copy library to destination. 
+    add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD +                       COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc +                       $<TARGET_FILE_DIR:omptarget-nvptx>) + +    # Install bitcode library under the lib destination folder. +    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") +  endforeach() endif() else() diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h index 84f6ec608..a02d962f6 100644 --- a/libomptarget/deviceRTLs/nvptx/src/interface.h +++ b/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -516,4 +516,7 @@ EXTERN void __kmpc_data_sharing_environment_end( EXTERN void * __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, int16_t IsOMPRuntimeInitialized); + +// SPMD execution mode interrogation function. +EXTERN int8_t __kmpc_is_spmd_exec_mode(); #endif diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index e76475238..0cb15f095 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -14,7 +14,9 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" - +#ifdef OMPD_SUPPORT + #include "ompd-specific.h" +#endif /*OMPD_SUPPORT*/ //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // template class that encapsulate all the helper functions @@ -86,7 +88,7 @@ public: T inputUb = ub; ub = lb + chunk - 1; // Clang uses i <= ub - last = ub == inputUb; + last = lb <= inputUb && inputUb <= ub; stride = loopSize; // make sure we only do 1 chunk per warp } @@ -96,8 +98,8 @@ public: INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter, T *plower, T *pupper, ST *pstride, ST chunk, bool IsSPMDExecutionMode, - bool IsOMPRuntimeUnavailable = false) { - // When 
IsOMPRuntimeUnavailable is true, we assume that the caller is + bool IsRuntimeUninitialized) { + // When IsRuntimeUninitialized is true, we assume that the caller is // in an L0 parallel region and that all worker threads participate. int tid = GetLogicalThreadIdInBlock(); @@ -105,23 +107,23 @@ public: // Assume we are in teams region or that we use a single block // per target region ST numberOfActiveOMPThreads = GetNumberOfOmpThreads( - tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + tid, IsSPMDExecutionMode, IsRuntimeUninitialized); // All warps that are in excess of the maximum requested, do // not execute the loop PRINT(LD_LOOP, "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " "%d, num tids %d\n", - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized), schedtype, P64(chunk), - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable), + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized), GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable)); + IsRuntimeUninitialized)); ASSERT0( LT_FUSSY, - (GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)) < + (GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized)) < (GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable)), + IsRuntimeUninitialized)), "current thread is not needed here; error"); // copy @@ -135,9 +137,9 @@ public: case kmp_sched_static_chunk: { if (chunk > 0) { entityId = - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); break; @@ -145,9 +147,9 @@ public: } // note: if chunk <=0, use nochunk case kmp_sched_static_nochunk: { entityId = - 
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); break; @@ -172,12 +174,12 @@ public: case kmp_sched_distr_static_chunk_sched_static_chunkone: { entityId = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable) * + IsRuntimeUninitialized) * GetOmpTeamId() + - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpTeams() * GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); break; @@ -187,9 +189,9 @@ public: PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", schedtype); entityId = - GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable); + GetOmpThreadId(tid, IsSPMDExecutionMode, IsRuntimeUninitialized); numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable); + IsRuntimeUninitialized); ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId, numberOfEntities); } @@ -202,9 +204,12 @@ public: PRINT(LD_LOOP, "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n", GetNumberOfOmpThreads(tid, IsSPMDExecutionMode, - IsOMPRuntimeUnavailable), + IsRuntimeUninitialized), GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper), P64(*pstride)); +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// @@ -215,7 +220,8 @@ public: schedule <= kmp_sched_ordered_last; } - INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, 
ST st, + INLINE static void dispatch_init(kmp_Indent *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, ST chunk) { int tid = GetLogicalThreadIdInBlock(); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); @@ -239,12 +245,17 @@ public: // Process schedule. if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + if (OrderedSchedule(schedule)) { + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); + } PRINT(LD_LOOP, "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", (long)tnum, P64(tripCount), schedule); schedule = kmp_sched_static_chunk; chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { // process runtime omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); @@ -282,18 +293,15 @@ public: "unknown schedule %d & chunk %lld\n", schedule, P64(chunk)); } - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // init schedules if (schedule == kmp_sched_static_chunk) { ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; // save ub omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; // compute static chunk ST stride; - T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); int lastiter = 0; ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params @@ -301,8 +309,8 @@ public: omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 "," - "next lower bound = %llu, stride = %llu\n", + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", GetNumberOfOmpThreads(tid, isSPMDMode(), 
isRuntimeUninitialized()), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), @@ -310,11 +318,12 @@ public: } else if (schedule == kmp_sched_static_nochunk) { ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; // save ub omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; // compute static chunk ST stride; - T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); int lastiter = 0; ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); // save computed params @@ -322,45 +331,53 @@ public: omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 "," - "next lower bound = %llu, stride = %llu\n", + "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), omptarget_nvptx_threadPrivateContext->Stride(tid)); } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - if (chunk < 1) - chunk = 1; - Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks - // but each thread (but one) must discover that it is last - eventNum += tnum; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum; + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); + // save sched state + int teamId = GetOmpTeamId(); + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + if (GetThreadIdInBlock() == 0) { + if (chunk < 1) + chunk = 1; + 
omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub; + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb; + } + if (isSPMDMode()) + __syncthreads(); + else + __kmpc_barrier(loc, threadId); PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, ub = %" PRId64 ", chunk %" PRIu64 ", " - "events number = %llu\n", + "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 + ", chunk %" PRIu64 "\n", GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId), + omptarget_nvptx_threadPrivateContext->Chunk(teamId)); } +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_parallel); +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// // Support for dispatch next - INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg, - Counter priv, T &lb, T &ub, - Counter &chunkId, Counter ¤tEvent, - T chunkSize, T loopUpperBound) { - // get next event atomically - Counter nextEvent = cg.Next(); - // calculate chunk Id (priv was initialized upon entering the loop to - // 'start' == 'event') - chunkId = nextEvent - priv; + INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + Counter &loopLowerBound, + T loopUpperBound) { // calculate lower bound for all lanes in the warp - lb = chunkId * chunkSize; // this code assume normalization of LB + lb = atomicAdd(&loopLowerBound, (Counter)chunkSize); ub = lb + chunkSize - 1; // Clang uses i <= ub // 3 result cases: @@ -368,9 +385,8 @@ public: // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> // NOT_FINISHED // c. 
lb and ub >= loopUpperBound: empty chunk --> FINISHED - currentEvent = nextEvent; // a. - if (ub <= loopUpperBound) { + if (lb <= loopUpperBound && ub < loopUpperBound) { PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb), P64(ub), P64(loopUpperBound)); return NOT_FINISHED; @@ -383,7 +399,8 @@ public: return LAST_CHUNK; } // c. if we are here, we are in case 'c' - lb = loopUpperBound + 1; + lb = loopUpperBound + 2; + ub = loopUpperBound + 1; PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb), P64(ub), P64(loopUpperBound)); return FINISHED; @@ -437,29 +454,18 @@ public: ASSERT0(LT_FUSSY, schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, "bad sched"); - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); T myLb, myUb; - Counter chunkId; - // xxx current event is now local - omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup(); + int teamId = GetOmpTeamId(); int finished = DynamicNextChunk( - cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb, - chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) { - cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid), - omptarget_nvptx_threadPrivateContext->EventsNumber(tid)); - cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid), - omptarget_nvptx_threadPrivateContext->CurrentEvent(tid)); + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId), + omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId)); + if (finished == FINISHED) return DISPATCH_FINISHED; - } // not finished (either not finished or last chunk) - *plast = (int32_t)( - myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + *plast = (int32_t)(finished == LAST_CHUNK); *plower = myLb; *pupper = myUb; *pstride = 
1; @@ -474,6 +480,9 @@ public: INLINE static void dispatch_fini() { // nothing +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif } //////////////////////////////////////////////////////////////////////////////// @@ -491,7 +500,7 @@ EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, @@ -499,7 +508,7 @@ EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid, int32_t st, int32_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, @@ -507,7 +516,7 @@ EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, @@ -515,7 +524,7 @@ EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid, int64_t st, int64_t chunk) { PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init( - (kmp_sched_t)schedule, lb, ub, st, chunk); + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); } // next @@ -581,7 +590,8 @@ EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, 
isSPMDMode(), + isRuntimeUninitialized()); } EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, @@ -591,7 +601,8 @@ EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid, int32_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), + isRuntimeUninitialized()); } EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, @@ -601,7 +612,8 @@ EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), + isRuntimeUninitialized()); } EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, @@ -611,7 +623,8 @@ EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid, int64_t chunk) { PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); omptarget_nvptx_LoopSupport::for_static_init( - schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode()); + schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode(), + isRuntimeUninitialized()); } EXTERN @@ -623,8 +636,8 @@ void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -636,8 +649,8 @@ void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); 
omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -649,8 +662,8 @@ void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -662,8 +675,8 @@ void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid, PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/true, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/true, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -674,8 +687,8 @@ void __kmpc_for_static_init_4_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -686,8 +699,8 @@ void __kmpc_for_static_init_4u_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -698,8 +711,8 @@ void __kmpc_for_static_init_8_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); omptarget_nvptx_LoopSupport::for_static_init( schedtype, 
plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN @@ -710,11 +723,14 @@ void __kmpc_for_static_init_8u_simple_generic( PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init( schedtype, plastiter, plower, pupper, pstride, chunk, - /*isSPMDExecutionMode=*/false, - /*IsOMPRuntimeUnavailable=*/true); + /*IsSPMDExecutionMode=*/false, + /*IsRuntimeUninitialized=*/true); } EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) { +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif PRINT0(LD_IO, "call kmpc_for_static_fini\n"); } diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu new file mode 100644 index 000000000..3cc18b908 --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.cu @@ -0,0 +1,107 @@ +#ifdef OMPD_SUPPORT +#include "ompd-specific.h" +#include "omptarget-nvptx.h" +/** + * Declaration of symbols to hold struct size and member offset information + */ + +__device__ __shared__ static int ompd_target_initialized; + +#define ompd_target_declare_access(t,m) __device__ __shared__ uint64_t ompd_access__##t##__##m##_; +OMPD_FOREACH_ACCESS(ompd_target_declare_access) +#undef ompd_target_declare_access + +#define ompd_target_declare_sizeof_member(t,m) __device__ __shared__ uint64_t ompd_sizeof__##t##__##m##_; + OMPD_FOREACH_ACCESS(ompd_target_declare_sizeof_member) +#undef ompd_target_declare_sizeof_member + +#define ompd_target_declare_sizeof(t) __device__ __shared__ uint64_t ompd_sizeof__##t##_; + OMPD_FOREACH_SIZEOF(ompd_target_declare_sizeof) +#undef ompd_target_declare_sizeof + +__device__ __shared__ + uint64_t ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam_; + +__device__ __shared__ + uint64_t 
ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam_; + + +__device__ void ompd_init ( void ) +{ + if (ompd_target_initialized) + return; + +#define ompd_target_init_access(t,m) ompd_access__##t##__##m##_ = (uint64_t)&(((t*)0)->m); + OMPD_FOREACH_ACCESS(ompd_target_init_access) +#undef ompd_target_init_access + + ompd_access__omptarget_nvptx_TaskDescr__items__threadsInTeam_ = + (uint64_t)&(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); + +#define ompd_target_init_sizeof_member(t,m) ompd_sizeof__##t##__##m##_ = sizeof(((t*)0)->m); + OMPD_FOREACH_ACCESS(ompd_target_init_sizeof_member) +#undef ompd_target_init_sizeof_member + + ompd_sizeof__omptarget_nvptx_TaskDescr__items__threadsInTeam_ = + (uint64_t)sizeof(((omptarget_nvptx_TaskDescr*)0)->items.threadsInTeam); + +#define ompd_target_init_sizeof(t) ompd_sizeof__##t##_ = sizeof(t); + OMPD_FOREACH_SIZEOF(ompd_target_init_sizeof) +#undef ompd_target_init_sizeof + + omptarget_nvptx_threadPrivateContext->ompd_levelZeroParallelInfo.level = 0; + if (isSPMDMode()) { + omptarget_nvptx_threadPrivateContext->teamContext.levelZeroTaskDescr + .ompd_thread_info.enclosed_parallel.parallel_tasks = + &omptarget_nvptx_threadPrivateContext->levelOneTaskDescr[0]; + } else { + // generic mode + omptarget_nvptx_threadPrivateContext->ompd_levelZeroParallelInfo + .parallel_tasks = &omptarget_nvptx_threadPrivateContext->teamContext + .levelZeroTaskDescr; + } + + ompd_target_initialized = 1; +} + +INLINE void ompd_init_thread(omptarget_nvptx_TaskDescr *currTaskDescr, + void *task_func, uint8_t implicit) { + currTaskDescr->ompd_thread_info.blockIdx_x = blockIdx.x; + currTaskDescr->ompd_thread_info.threadIdx_x = threadIdx.x; + currTaskDescr->ompd_thread_info.task_function = task_func; + currTaskDescr->ompd_thread_info.task_implicit = implicit; +} + +__device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state) { + taskDescr->ompd_thread_info.state = state; +} + +__device__ 
void ompd_set_device_thread_state(omp_state_t state) { + ompd_set_device_specific_thread_state(getMyTopTaskDescriptor(), state); +} + +__device__ void ompd_init_thread_parallel() { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(currTaskDescr, omptarget_nvptx_workFn, 1); + ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_parallel); +} + +__device__ void ompd_init_thread_master() { + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(currTaskDescr, NULL, 1); + ompd_set_device_specific_thread_state(currTaskDescr, omp_state_work_serial); +} + +__device__ void ompd_init_explicit_task(void *task_func) { + omptarget_nvptx_TaskDescr *taskDescr = getMyTopTaskDescriptor(); + ompd_init_thread(taskDescr, task_func, 0); +} + +__device__ void ompd_bp_parallel_begin (){ asm (""); } +__device__ void ompd_bp_parallel_end (){ asm (""); } +__device__ void ompd_bp_task_begin (){ asm (""); } +__device__ void ompd_bp_task_end (){ asm (""); } +__device__ void ompd_bp_thread_begin (){ asm (""); } +__device__ void ompd_bp_thread_end (){ asm (""); } +#endif /* OMPD_SUPPORT */ diff --git a/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h new file mode 100644 index 000000000..8b929e5fe --- /dev/null +++ b/libomptarget/deviceRTLs/nvptx/src/ompd-specific.h @@ -0,0 +1,94 @@ +#ifndef __OMPD_SPECIFIC_H__ +#define __OMPD_SPECIFIC_H__ + +#ifdef OMPD_SUPPORT + +#include "state-queue.h" +#include "option.h" +#include + + + +__device__ void ompd_init( void ); +extern "C" __device__ void ompd_bp_parallel_begin ( void ); +extern "C" __device__ void ompd_bp_parallel_end ( void ); +extern "C" __device__ void ompd_bp_task_begin ( void ); +extern "C" __device__ void ompd_bp_task_end ( void ); +extern "C" __device__ void ompd_bp_thread_begin ( void ); +extern "C" __device__ void ompd_bp_thread_end ( void ); + + +#define OMPD_FOREACH_ACCESS(OMPD_ACCESS) \ + 
OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext, topTaskDescr) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,teamContext) \ + OMPD_ACCESS(omptarget_nvptx_ThreadPrivateContext,ompd_levelZeroParallelInfo) \ + OMPD_ACCESS(omptarget_nvptx_TaskDescr,ompd_thread_info) \ + OMPD_ACCESS(omptarget_nvptx_TaskDescr,prev) \ + OMPD_ACCESS(omptarget_nvptx_TeamDescr,levelZeroTaskDescr) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,state) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,threadIdx_x) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,enclosed_parallel) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,task_function) \ + OMPD_ACCESS(ompd_nvptx_thread_info_t,task_implicit) \ + OMPD_ACCESS(ompd_nvptx_parallel_info_t,level) \ + OMPD_ACCESS(ompd_nvptx_parallel_info_t,parallel_tasks) + + +#define OMPD_FOREACH_SIZEOF(OMPD_SIZEOF) \ + OMPD_SIZEOF(omptarget_nvptx_ThreadPrivateContext)\ + OMPD_SIZEOF(omptarget_nvptx_TaskDescr) \ + OMPD_SIZEOF(ompd_nvptx_thread_info_t) + + +/* we only support work states for the moment */ +typedef enum { + omp_state_undefined = 0x102, + omp_state_work_serial = 0x000, + omp_state_work_parallel = 0x001, + omp_state_work_reduction = 0x002 +} omp_state_t; + +class omptarget_nvptx_TaskDescr; + +__device__ void ompd_init_thread_master(); +__device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); +__device__ void ompd_set_device_thread_state(omp_state_t state); +__device__ void ompd_init_thread_parallel(); +__device__ void ompd_init_explicit_task(void *task_func); + +INLINE void ompd_reset_device_thread_state() { + ompd_set_device_thread_state(omp_state_work_serial); +} + +/* We store parallel info in the threadPrivateContext the same way that task + * descriptors are stored. 
Currently there is no support for nested + * parallelism (TODO: there will probably be in the future), so we store one + * parallel descriptor in the threadPrivateContext for the outermost parallel + * region and additionally one descriptor in each thread in case of serialized + * inner parallel regions + */ +typedef struct { + uint16_t level; + /* If level = 0, parallel_tasks points just to the master task descriptor + * if level = 1, parallel_tasks points to threadPrivateContext->levelOneTaskDescr + * if level > 1, we are in a serialized parallel region and parallel_tasks points + * to the single task in the parallel region. + */ + omptarget_nvptx_TaskDescr *parallel_tasks; +} ompd_nvptx_parallel_info_t; + +typedef struct { + uint64_t state; // In the host runtime we use the OMPT state. + // Here we need to have our own place to store it. + uint16_t blockIdx_x; // Libomptarget should only schedule task in one dimension. + // To store a unique identifier for the current thread, we + // simply store ThreadIdx.x and BlockIdx.x + uint16_t threadIdx_x; + ompd_nvptx_parallel_info_t enclosed_parallel; + void *task_function; + uint8_t task_implicit; +} ompd_nvptx_thread_info_t; + +#endif /* OMPD_SUPPORT */ +#endif diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index 4a8610403..f3202a2bb 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -92,6 +92,11 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); currTaskDescr->NThreads() = GetNumberOfWorkersInTeam(); currTaskDescr->ThreadLimit() = ThreadLimit; +#ifdef OMPD_SUPPORT + ompd_init(); + ompd_init_thread_master(); + ompd_bp_thread_begin(); +#endif /*OMPD_SUPPORT*/ } EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { @@ -105,6 +110,9 @@ EXTERN void
__kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); } +#ifdef OMPD_SUPPORT + ompd_bp_thread_end(); +#endif // Done with work. Kill the workers. omptarget_nvptx_workFn = 0; } @@ -138,6 +146,11 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, currTeamDescr.InitTeamDescr(); // init counters (copy start to init) workDescr.CounterGroup().Reset(); +#ifdef OMPD_SUPPORT + ompd_init(); + ompd_bp_parallel_begin(); // This should be placed later, but the parallel + // handle is ready from here on. +#endif /*OMPD_SUPPORT*/ } __syncthreads(); @@ -173,17 +186,33 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; } +#ifdef OMPD_SUPPORT + ompd_init_thread_parallel(); // __kmpc_kernel_parallel() is not called in + // spmd mode + ompd_bp_thread_begin(); +#endif } EXTERN void __kmpc_spmd_kernel_deinit() { // We're not going to pop the task descr stack of each thread since // there are no more parallel regions in SPMD mode. __syncthreads(); +#ifdef OMPD_SUPPORT + ompd_bp_thread_end(); +#endif int threadId = GetThreadIdInBlock(); if (threadId == 0) { +#ifdef OMPD_SUPPORT + ompd_bp_parallel_end(); +#endif // Enqueue omp state object for use by another team. int slot = smid() % MAX_SM; omptarget_nvptx_device_State[slot].Enqueue( omptarget_nvptx_threadPrivateContext); } } + +// Return true if the current target region is executed in SPMD mode. 
+EXTERN int8_t __kmpc_is_spmd_exec_mode() { + return isSPMDMode(); +} diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 2bc5819e6..88daa79d4 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -32,6 +32,7 @@ #include "option.h" // choices we have #include "state-queue.h" #include "support.h" +#include "ompd-specific.h" #define OMPTARGET_NVPTX_VERSION 1.1 @@ -53,13 +54,13 @@ #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 #define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down_sync((mask), (var), (delta), (width)) + __shfl_down_sync((mask), (var), (delta), (width)) #define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate)) #define __ACTIVEMASK() __activemask() #else #define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane)) #define __SHFL_DOWN_SYNC(mask, var, delta, width) \ - __shfl_down((var), (delta), (width)) + __shfl_down((var), (delta), (width)) #define __BALLOT_SYNC(mask, predicate) __ballot((predicate)) #define __ACTIVEMASK() __ballot(1) #endif @@ -150,6 +151,14 @@ extern __device__ __shared__ DataSharingStateTy DataSharingState; // task ICV and (implicit & explicit) task state class omptarget_nvptx_TaskDescr { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); + friend INLINE void ompd_init_thread( + omptarget_nvptx_TaskDescr *currTaskDescr, void *task_func, + uint8_t implicit); + friend __device__ void ompd_set_device_specific_thread_state( + omptarget_nvptx_TaskDescr *taskDescr, omp_state_t state); +#endif /* OMPD_SUPPORT */ public: // methods for flags INLINE omp_sched_t GetRuntimeSched(); @@ -192,6 +201,11 @@ class omptarget_nvptx_TaskDescr { INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr
*parentTaskDescr, uint16_t tid, uint16_t tnum); +#ifdef OMPD_SUPPORT + INLINE ompd_nvptx_thread_info_t *ompd_ThreadInfo() { + return &ompd_thread_info; + } +#endif private: // bits for flags: (7 used, 1 free) @@ -216,6 +230,9 @@ class omptarget_nvptx_TaskDescr { uint16_t threadsInTeam; // threads in current team uint64_t runtimeChunkSize; // runtime chunk size } items; +#ifdef OMPD_SUPPORT + ompd_nvptx_thread_info_t ompd_thread_info; +#endif omptarget_nvptx_TaskDescr *prev; }; @@ -247,6 +264,9 @@ class omptarget_nvptx_WorkDescr { //////////////////////////////////////////////////////////////////////////////// class omptarget_nvptx_TeamDescr { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /*OMPD_SUPPORT*/ public: // access to data INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { @@ -310,6 +330,9 @@ // tid refers here to the global thread id // do not support multiple concurrent kernel a this time class omptarget_nvptx_ThreadPrivateContext { +#ifdef OMPD_SUPPORT + friend void __device__ ompd_init( void ); +#endif /* OMPD_SUPPORT */ public: // task INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { @@ -377,6 +400,10 @@ Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM]; // Queue to which this object must be returned.
uint64_t SourceQueue; +#ifdef OMPD_SUPPORT + // The implicit parallel region around the master task in generic mode + ompd_nvptx_parallel_info_t ompd_levelZeroParallelInfo; +#endif }; /// Device envrionment data diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index d4546284f..0446d7170 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -34,6 +34,9 @@ //===----------------------------------------------------------------------===// #include "omptarget-nvptx.h" +#ifdef OMPD_SUPPORT + #include "ompd-specific.h" +#endif /*OMPD_SUPPORT*/ typedef struct ConvergentSimdJob { omptarget_nvptx_TaskDescr taskDescr; @@ -301,6 +304,20 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), "only team master can create parallel"); +#ifdef OMPD_SUPPORT + // Set ompd info for first level parallel region (this info is stored in the + // master threads task info, so it can easily be accessed + ompd_nvptx_parallel_info_t &nextPar = currTaskDescr->ompd_ThreadInfo() + ->enclosed_parallel; + nextPar.level = 1; + nextPar.parallel_tasks = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(0); + // Move the previous thread into undefined state (will be reset in __kmpc_kernel_end_parallel) + // TODO (mr) find a better place to do this + ompd_set_device_thread_state(omp_state_undefined); + ompd_bp_parallel_begin(); +#endif /*OMPD_SUPPORT*/ + // set number of threads on work descriptor // this is different from the number of cuda threads required for the parallel // region @@ -355,6 +372,10 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn, newTaskDescr->ThreadId(), newTaskDescr->NThreads()); isActive = true; +#ifdef OMPD_SUPPORT + ompd_init_thread_parallel(); + ompd_bp_thread_begin(); +#endif /*OMPD_SUPPORT*/ } return isActive; @@ -369,6 +390,13 @@ EXTERN void __kmpc_kernel_end_parallel() { 
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); + ompd_bp_thread_end(); + if (threadId == 0) { + ompd_bp_parallel_end(); + } +#endif /*OMPD_SUPPORT*/ } //////////////////////////////////////////////////////////////////////////////// @@ -400,9 +428,26 @@ EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) { newTaskDescr->ThreadId() = 0; newTaskDescr->ThreadsInTeam() = 1; +#ifdef OMPD_SUPPORT + // Set ompd parallel info for the next parallel region in the previous task + // descriptor + ompd_nvptx_parallel_info_t &newPar = + currTaskDescr->ompd_ThreadInfo()->enclosed_parallel; + newPar.level = currTaskDescr->GetPrevTaskDescr() + ->ompd_ThreadInfo() + ->enclosed_parallel + .level + 1; + newPar.parallel_tasks = newTaskDescr; +#endif + // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, newTaskDescr); +#ifdef OMPD_SUPPORT + ompd_init_thread_parallel(); // we are still in a parallel region + // every thread is a parallel region..
hooray + ompd_bp_parallel_begin(); +#endif /*OMPD_SUPPORT*/ } EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, @@ -415,6 +460,9 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc, // set new top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, currTaskDescr->GetPrevTaskDescr()); +#ifdef OMPD_SUPPORT + ompd_bp_parallel_end(); +#endif // free SafeFree(currTaskDescr, (char *)"new seq parallel task"); } diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index afa8e81eb..ac1cd8407 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -161,6 +161,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t NumThreads = GetNumberOfOmpThreads( + BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); + if (NumThreads == 1) + return 1; /* * This reduce function handles reduction within a team. It handles * parallel regions in both L1 and L2 parallelism levels. It also @@ -171,11 +176,11 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, * 3. Warp 0 reduces to a single value. * 4. The reduced value is available in the thread that returns 1. 
*/ +#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_reduction); +#endif /*OMPD_SUPPORT*/ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; uint32_t WarpId = BlockThreadId / WARPSIZE; @@ -203,8 +208,17 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, BlockThreadId); +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + return BlockThreadId == 0; } + +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + return BlockThreadId == 0; #else uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true); @@ -219,10 +233,6 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, // early. return gpu_irregular_simd_reduce(reduce_data, shflFct); - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); - uint32_t NumThreads = GetNumberOfOmpThreads( - BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); - // When we have more than [warpsize] number of threads // a block reduction is performed here. // @@ -243,6 +253,10 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, return BlockThreadId == 0; } +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ + // Get the OMP thread Id. This is different from BlockThreadId in the case of // an L2 parallel region. return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode, @@ -289,6 +303,9 @@ int32_t nvptx_teams_reduce_nowait( // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. 
+#ifdef OMPD_SUPPORT + ompd_set_device_thread_state(omp_state_work_reduction); +#endif /*OMPD_SUPPORT*/ uint32_t NumThreads = isSPMDExecutionMode ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true, @@ -403,6 +420,9 @@ int32_t nvptx_teams_reduce_nowait( } #endif // __CUDA_ARCH__ >= 700 +#ifdef OMPD_SUPPORT + ompd_reset_device_thread_state(); +#endif /*OMPD_SUPPORT*/ return ThreadId == 0; } diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu index a577d7a6c..68f08a16a 100644 --- a/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -35,40 +35,46 @@ EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) { EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) { PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __syncthreads(); + __kmpc_barrier(loc_ref, tid); PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); return 0; } EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) { - tid = GetLogicalThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); - if (!currTaskDescr->InL2OrHigherParallelRegion()) { - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + if (isSPMDMode()) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else if (isRuntimeUninitialized()) { + __kmpc_barrier_simple_generic(loc_ref, tid); + } else { + tid = GetLogicalThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); + if (!currTaskDescr->InL2OrHigherParallelRegion()) { + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. 
This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 - // worker is started, so we don't need a barrier. - if (numberOfActiveOMPThreads > 1) { + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 + // worker is started, so we don't need a barrier. + if (numberOfActiveOMPThreads > 1) { #endif - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - numberOfActiveOMPThreads, threads); - // Barrier #1 is for synchronization among active threads. - named_sync(L1_BARRIER, threads); + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + numberOfActiveOMPThreads, threads); + // Barrier #1 is for synchronization among active threads. + named_sync(L1_BARRIER, threads); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } // numberOfActiveOMPThreads > 1 + } // numberOfActiveOMPThreads > 1 #endif + } + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); } // Emit a simple barrier call in SPMD mode. 
Assumes the caller is in an L0 diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/nvptx/src/task.cu index 8d4796778..76166ea8c 100644 --- a/libomptarget/deviceRTLs/nvptx/src/task.cu +++ b/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -97,7 +97,10 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - +#ifdef OMPD_SUPPORT + ompd_init_explicit_task((void*)(newKmpTaskDescr->sub)); + ompd_bp_task_begin(); +#endif // 3. call sub PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr)); @@ -105,6 +108,10 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid, PRINT(LD_TASK, "return from call task sub 0x%llx()\n", P64(newKmpTaskDescr->sub)); +#ifdef OMPD_SUPPORT + ompd_bp_task_end(); +#endif + // 4. pop context omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, parentTaskDescr); diff --git a/libomptarget/plugins/CMakeLists.txt b/libomptarget/plugins/CMakeLists.txt index 6c24d0e1d..8c3d57168 100644 --- a/libomptarget/plugins/CMakeLists.txt +++ b/libomptarget/plugins/CMakeLists.txt @@ -37,7 +37,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") # Install plugin under the lib destination folder. 
install(TARGETS "omptarget.rtl.${tmachine_libname}" - LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries( "omptarget.rtl.${tmachine_libname}" diff --git a/libomptarget/plugins/cuda/CMakeLists.txt b/libomptarget/plugins/cuda/CMakeLists.txt index 8763065e7..7210eec10 100644 --- a/libomptarget/plugins/cuda/CMakeLists.txt +++ b/libomptarget/plugins/cuda/CMakeLists.txt @@ -39,7 +39,7 @@ include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) # Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) +install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") target_link_libraries(omptarget.rtl.cuda ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} diff --git a/libomptarget/plugins/cuda/src/rtl.cpp b/libomptarget/plugins/cuda/src/rtl.cpp index fe2f9f67c..90048a3eb 100644 --- a/libomptarget/plugins/cuda/src/rtl.cpp +++ b/libomptarget/plugins/cuda/src/rtl.cpp @@ -54,6 +54,19 @@ static int DebugLevel = 0; {} #endif +#if OMPD_SUPPORT +#ifdef __cplusplus +extern "C" { +#endif + /* TODO - Put these OMPD globals someplace cleaner */ + uint64_t ompd_num_cuda_devices; + CUcontext* ompd_CudaContextArray; +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* OMPD_SUPPORT */ + + /// Keep entries table per device. struct FuncOrGblEntryTy { __tgt_target_table Table; @@ -92,7 +105,7 @@ std::list KernelsList; /// Class containing all the device information. 
class RTLDeviceInfoTy { - std::vector FuncGblEntries; + std::vector> FuncGblEntries; public: int NumberOfDevices; @@ -122,7 +135,7 @@ class RTLDeviceInfoTy { void addOffloadEntry(int32_t device_id, __tgt_offload_entry entry) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); E.Entries.push_back(entry); } @@ -131,7 +144,7 @@ class RTLDeviceInfoTy { bool findOffloadEntry(int32_t device_id, void *addr) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); for (auto &it : E.Entries) { if (it.addr == addr) @@ -145,7 +158,7 @@ class RTLDeviceInfoTy { __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); int32_t size = E.Entries.size(); @@ -167,7 +180,8 @@ class RTLDeviceInfoTy { void clearOffloadEntriesTable(int32_t device_id) { assert(device_id < (int32_t)FuncGblEntries.size() && "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id]; + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); E.Entries.clear(); E.Table.EntriesBegin = E.Table.EntriesEnd = 0; } @@ -204,6 +218,10 @@ class RTLDeviceInfoTy { FuncGblEntries.resize(NumberOfDevices); Contexts.resize(NumberOfDevices); +#if OMPD_SUPPORT + ompd_num_cuda_devices = (uint64_t)Contexts.size(); + ompd_CudaContextArray = &Contexts[0]; +#endif /* OMPD_SUPPORT */ ThreadsPerBlock.resize(NumberOfDevices); BlocksPerGrid.resize(NumberOfDevices); WarpSize.resize(NumberOfDevices); diff --git a/libomptarget/src/CMakeLists.txt b/libomptarget/src/CMakeLists.txt index 2d606728e..be099f309 100644 --- 
a/libomptarget/src/CMakeLists.txt +++ b/libomptarget/src/CMakeLists.txt @@ -28,4 +28,4 @@ target_link_libraries(omptarget "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") # Install libomptarget under the lib destination folder. -install(TARGETS omptarget LIBRARY DESTINATION lib${OPENMP_LIBDIR_SUFFIX}) +install(TARGETS omptarget LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") diff --git a/offload/Makefile b/offload/Makefile deleted file mode 100755 index 75e3744a6..000000000 --- a/offload/Makefile +++ /dev/null @@ -1,224 +0,0 @@ -# -##//===----------------------------------------------------------------------===// -#// -#// The LLVM Compiler Infrastructure -#// -#// This file is dual licensed under the MIT and the University of Illinois Open -#// Source Licenses. See LICENSE.txt for details. -#// -#//===----------------------------------------------------------------------===// -# - -# MAKEFILE PARAMETERS -# -# root_dir - path to root directory of liboffload -# build_dir - path to build directory -# mpss_dir - path to root directory of mpss -# mpss_version - version of the mpss (e.g., version "3.3.x" would be "33") -# libiomp_host_dir - path to host libiomp directory (unnecessary if compiler_host is icc) -# libiomp_target_dir - path to target libiomp directory (unnecesarry if compiler_target is icc) -# omp_header_dir - path to omp.h (unnecessary if compiler_host and compiler_target are icc) -# os_host - host operating system -# os_target - target operating system -# compiler_host - host compiler -# compiler_target - target compiler -# options_host - additional options for host compiler -# options_target - additional options for target compiler -# - -# Directories -root_dir?=. 
-build_dir?=$(root_dir)/build -build_host_dir=$(build_dir)/host -build_target_dir=$(build_dir)/target -obj_host_dir=$(build_dir)/obj_host -obj_target_dir=$(build_dir)/obj_target -source_dir=$(root_dir)/src -imported_dir=$(source_dir)/imported - -# OS -os_host?=linux -os_target?=linux -ifneq ($(os_host)_$(os_target), linux_linux) - $(error "Only linux is supported") -endif - -# Compilers -compiler_host?=gcc -compiler_target?=gcc - -# MPSS -mpss_version?=30 -mpss_dir?=/ -mpss_present=$(shell if test -d $(mpss_dir); then echo OK; else echo KO; fi) -ifneq ($(mpss_present), OK) - $(error "Cannot find MPSS directory $(mpss_dir)") -endif - -ifeq ($(shell test $(mpss_version) -gt 33; echo $$?), 0) - coi_dir=$(mpss_dir)/sysroots/k1om-mpss-linux/usr - coi_include=$(coi_dir)/include/intel-coi - coi_lib_host=$(mpss_dir)/lib64 - coi_lib_device=$(coi_dir)/lib64 -else - coi_dir=$(mpss_dir)/opt/intel/mic/coi - coi_include=$(coi_dir)/include - coi_lib_host=$(coi_dir)/host-linux-release/lib - coi_lib_device=$(coi_dir)/device-linux-release/lib -endif -myo_dir=$(mpss_dir)/opt/intel/mic/myo - -# Sources -src_liboffload_common=dv_util.cpp liboffload_error.c liboffload_msg.c offload_common.cpp offload_table.cpp offload_trace.cpp offload_util.cpp - -src_liboffload_host=$(src_liboffload_common) cean_util.cpp coi/coi_client.cpp compiler_if_host.cpp offload_engine.cpp offload_env.cpp offload_host.cpp offload_omp_host.cpp offload_timer_host.cpp offload_orsl.cpp orsl-lite/lib/orsl-lite.c offload_myo_host.cpp -src_liboffload_host:=$(foreach file,$(src_liboffload_host),$(source_dir)/$(file)) - -src_liboffload_target=$(src_liboffload_common) coi/coi_server.cpp compiler_if_target.cpp offload_omp_target.cpp offload_target.cpp offload_timer_target.cpp offload_myo_target.cpp -src_liboffload_target:=$(foreach file,$(src_liboffload_target),$(source_dir)/$(file)) - -src_ofld=ofldbegin.cpp ofldend.cpp -src_ofld:=$(foreach file,$(src_ofld),$(source_dir)/$(file)) - -headers=$(wildcard $(source_dir)/*.h) 
$(wildcard $(source_dir)/coi/*.h) $(wildcard $(source_dir)/orsl-lite/include/*.h) -ifneq ($(omp_header_dir), ) - headers+=$(imported_dir)/omp.h -endif - -# Objects -obj_liboffload_host=$(notdir $(src_liboffload_host)) -obj_liboffload_host:=$(obj_liboffload_host:.cpp=.o) -obj_liboffload_host:=$(obj_liboffload_host:.c=.o) -obj_liboffload_host:=$(foreach file,$(obj_liboffload_host),$(obj_host_dir)/$(file)) - -obj_liboffload_target=$(notdir $(src_liboffload_target)) -obj_liboffload_target:=$(obj_liboffload_target:.cpp=.o) -obj_liboffload_target:=$(obj_liboffload_target:.c=.o) -obj_liboffload_target:=$(foreach file,$(obj_liboffload_target),$(obj_target_dir)/$(file)) - -obj_ofld=$(notdir $(src_ofld)) -obj_ofld:=$(obj_ofld:.cpp=.o) -obj_ofld_host=$(foreach file,$(obj_ofld),$(build_host_dir)/$(file)) -obj_ofld_target=$(foreach file,$(obj_ofld),$(build_target_dir)/$(file)) - -# Options -opts_common=-O2 -w -fpic -c -DCOI_LIBRARY_VERSION=2 -DMYO_SUPPORT -DOFFLOAD_DEBUG=1 -DSEP_SUPPORT -DTIMING_SUPPORT -I$(coi_include) -I$(myo_dir)/include -I$(source_dir) -ifneq ($(omp_header_dir), ) - opts_common+=-I$(imported_dir) -endif - -opts_liboffload=-shared -Wl,-soname,liboffload.so.5 -ldl -lstdc++ -liomp5 - -opts_liboffload_host=$(opts_liboffload) -L$(coi_lib_host) -lcoi_host -L$(myo_dir)/lib -lmyo-client -ifneq ($(libiomp_host_dir), ) - opts_liboffload_host+=-L$(libiomp_host_dir) -endif - -opts_liboffload_target=$(opts_liboffload) -L$(coi_lib_device) -lcoi_device -L$(myo_dir)/lib -lmyo-service -ifneq ($(libiomp_target_dir), ) - opts_liboffload_target+=-L$(libiomp_target_dir) -endif - -options_host?= -opts_host=$(options_host) -DHOST_LIBRARY=1 -DMPSS_VERSION=$(mpss_version) -ifeq ($(os_host), linux) - opts_host+=-DLINUX -endif - -options_target?= -opts_target=$(options_target) -DHOST_LIBRARY=0 -ifeq ($(os_target), linux) - opts_target+=-DLINUX -endif -ifeq ($(compiler_target), icc) - opts_target+=-mmic -endif - -# Make targets -.PHONY: all clean info - -all: info 
$(build_host_dir)/liboffload.so $(build_target_dir)/liboffload.so $(obj_ofld_host) $(obj_ofld_target) - - -$(build_host_dir)/liboffload.so: $(build_host_dir)/liboffload.so.5 | $(build_host_dir) - ln -f $< $@ - -$(build_host_dir)/liboffload.so.5: $(obj_liboffload_host) | $(build_host_dir) - $(compiler_host) $(opts_liboffload_host) $(opts_host) $^ -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(obj_host_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - - -$(build_target_dir)/liboffload.so: $(build_target_dir)/liboffload.so.5 | $(build_target_dir) - ln -f $< $@ - -$(build_target_dir)/liboffload.so.5: $(obj_liboffload_target) | $(build_target_dir) - $(compiler_target) $(opts_liboffload_target) $(opts_target) $^ -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/%.c $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/coi/%.cpp $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - -$(obj_target_dir)/%.o: $(source_dir)/orsl-lite/lib/%.c $(headers) | $(obj_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - - -$(build_host_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_host_dir) - $(compiler_host) $(opts_common) $(opts_host) $< -o $@ - -$(build_target_dir)/%.o: $(source_dir)/%.cpp $(headers) | $(build_target_dir) - $(compiler_target) $(opts_common) $(opts_target) $< -o $@ - - 
-$(imported_dir)/omp.h: $(omp_header_dir)/omp.h | $(imported_dir) - cp $< $@ - - -$(build_host_dir) $(build_target_dir) $(obj_host_dir) $(obj_target_dir): | $(build_dir) - $(shell mkdir -p $@ >/dev/null 2>/dev/null) - @echo "Created $@ directory" - -$(build_dir): - $(shell mkdir -p $@ >/dev/null 2>/dev/null) - @echo "Created $@ directory" - -$(imported_dir): - $(shell mkdir -p $@ >/dev/null 2>/dev/null) - @echo "Created $@ directory" - - -clean: - $(shell rm -rf $(build_dir)) - @echo "Remove $(build_dir) directory" - - -info: - @echo "root_dir = $(root_dir)" - @echo "build_dir = $(build_dir)" - @echo "mpss_dir = $(mpss_dir)" - @echo "mpss_version = $(mpss_version)" - @echo "libiomp_host_dir = $(libiomp_host_dir)" - @echo "libiomp_target_dir = $(libiomp_target_dir)" - @echo "omp_header_dir = $(omp_header_dir)" - @echo "os_host = $(os_host)" - @echo "os_target = $(os_target)" - @echo "compiler_host = $(compiler_host)" - @echo "compiler_target = $(compiler_target)" - @echo "options_host = $(options_host)" - @echo "options_target = $(options_target)" - diff --git a/offload/README.txt b/offload/README.txt deleted file mode 100755 index eb9fb1da3..000000000 --- a/offload/README.txt +++ /dev/null @@ -1,129 +0,0 @@ - - README for Intel(R) Offload Runtime Library - =========================================== - -How to Build Documentation -========================== - -The main documentation is in Doxygen* format, and this distribution -should come with pre-built PDF documentation in doc/Reference.pdf. -However, an HTML version can be built by executing: - -% doxygen doc/doxygen/config - -in this directory. - -That will produce HTML documentation in the doc/doxygen/generated -directory, which can be accessed by pointing a web browser at the -index.html file there. - -If you don't have Doxygen installed, you can download it from -www.doxygen.org. 
- - -Software Requirements -===================== - -Intel(R) Offload Runtime Library requires additional software: - -1) Intel(R) OpenMP* Runtime Library. You can either download the source -code for that (from openmprtl.org or openmp.llvm.org) or simply use the -compiled version distributed with the Intel compilers. -2) Intel(R) COI Runtime Library and Intel(R) MYO Runtime Library. These -libraries are part of Intel(R) Manycore Platform Software Stack (MPSS). You -can download MPSS source code or binaries from -software.intel.com/en-us/articles/intel-manycore-platform-software-stack-mpss. -Binaries include host libraries for Intel(R) 64 Architecture and target -libraries for Intel(R) Many Integrated Core Architecture. - -Also you will require all of the libraries that enable the target code to run -on device. If you target the Intel(R) Xeon Phi (TM) coprocessor, these -libraries can be taken from MPSS too. - - -How to Build the Intel(R) Offload Runtime Library -================================================= - -The Makefile at the top-level will attempt to detect what it needs to -build the Intel(R) Offload Runtime Library. To see the default settings, -type: - -make info - -You can change the Makefile's behavior with the following options: - -root_dir: The path to the top-level directory containing the - top-level Makefile. By default, this will take on the - value of the current working directory. - -build_dir: The path to the build directory. By default, this will - take on value [root_dir]/build. - -mpss_dir: The path to the Intel(R) Manycore Platform Software - Stack install directory. By default, this will take on - the value of operating system's root directory. - -libiomp_host_dir: The path to the host Intel(R) OpenMP* Runtime Library. - This option is required when the host compiler is other - than icc. - -libiomp_target_dir: The path to the target Intel(R) OpenMP* Runtime - Library. This option is required when the target - compiler is other than icc. 
- -omp_header_dir: The path to the header file of Intel(R) OpenMP* - Runtime Library. This option is required if either host - or target compiler is other than icc. - -os_host: Operating system on host. Currently supports only - "linux" which is set by default. - -os_target: Operating system on target device. Currently supports - only "linux" which is set by default. - -compiler_host: Which compiler to use for the build of the host part. - Defaults to "gcc"*. Also supports "icc" and "clang"*. - You should provide the full path to the compiler or it - should be in the user's path. - -compiler_host: Which compiler to use for the build of the target part. - Defaults to "gcc"*. Also supports "icc" and "clang"*. - You should provide the full path to the compiler or it - should be in the user's path. - -options_host: Additional options for the host compiler. - -options_target: Additional options for the target compiler. - -To use any of the options above, simple add =. For -example, if you want to build with icc instead of gcc, type: - -make compiler_host=icc compiler_target=icc - - -Supported RTL Build Configurations -================================== - -Supported Architectures: Intel(R) 64, and Intel(R) Many Integrated -Core Architecture - - --------------------------------------------- - | icc/icl | gcc | clang | ---------------|---------------|---------------------------| -| Linux* OS | Yes | Yes(1) | Yes(1) | -| OS X* | No | No | No | -| Windows* OS | No | No | No | ------------------------------------------------------------ - -(1) Liboffload requires _rdtsc intrinsic, which may be unsupported by some - versions of compiler. 
In this case you need to include src/rdtsc.h - manually by using Makefile options options_host and options_target: - - make options_host="-include src/rdtsc.h" options_target="-include src/rdtsc.h" - ------------------------------------------------------------------------ - -Notices -======= - -*Other names and brands may be claimed as the property of others. diff --git a/offload/doc/Reference.pdf b/offload/doc/Reference.pdf deleted file mode 100644 index b9176f07f..000000000 Binary files a/offload/doc/Reference.pdf and /dev/null differ diff --git a/offload/doc/doxygen/config b/offload/doc/doxygen/config deleted file mode 100755 index 275258f76..000000000 --- a/offload/doc/doxygen/config +++ /dev/null @@ -1,2328 +0,0 @@ -# Doxyfile 1.8.6 - -# This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project. -# -# All text after a double hash (##) is considered a comment and is placed in -# front of the TAG it is preceding. -# -# All text after a single hash (#) is considered a comment and will be ignored. -# The format is: -# TAG = value [value, ...] -# For lists, items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (\" \"). - -#--------------------------------------------------------------------------- -# Project related configuration options -#--------------------------------------------------------------------------- - -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. -# The default value is: UTF-8. 
- -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by -# double-quotes, unless you are using Doxywizard) that should identify the -# project for which the documentation is generated. This name is used in the -# title of most generated pages and in a few other places. -# The default value is: My Project. - -PROJECT_NAME = "Intel® Offload Runtime Library" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. This -# could be handy for archiving the generated documentation or if some version -# control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer a -# quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify an logo or icon that is included in -# the documentation. The maximum height of the logo should not exceed 55 pixels -# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo -# to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path -# into which the generated documentation will be written. If a relative path is -# entered, it will be relative to the location where doxygen was started. If -# left blank the current directory will be used. - -OUTPUT_DIRECTORY = doc/doxygen/generated - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this -# option can be useful when feeding doxygen a huge amount of source files, where -# putting all generated files in the same directory would otherwise causes -# performance problems for the file system. -# The default value is: NO. 
- -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. -# The default value is: English. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member -# descriptions after the members that are listed in the file and class -# documentation (similar to Javadoc). Set to NO to disable this. -# The default value is: YES. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief -# description of a member or function before the detailed description -# -# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. -# The default value is: YES. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator that is -# used to form the text in various listings. Each string in this list, if found -# as the leading text of the brief description, will be stripped from the text -# and the result, after processing the whole list, is used as the annotated -# text. Otherwise, the brief description is used as-is. 
If left blank, the -# following values are used ($name is automatically replaced with the name of -# the entity):The $name class, The $name widget, The $name file, is, provides, -# specifies, contains, represents, a, an and the. - -ABBREVIATE_BRIEF = "The $name class" \ - "The $name widget" \ - "The $name file" \ - is \ - provides \ - specifies \ - contains \ - represents \ - a \ - an \ - the - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# doxygen will generate a detailed section even if there is only a brief -# description. -# The default value is: NO. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. -# The default value is: NO. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path -# before files name in the file list and in the header files. If set to NO the -# shortest path that makes the file name unique will be used -# The default value is: YES. - -FULL_PATH_NAMES = YES - -# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. -# Stripping is only done if one of the specified strings matches the left-hand -# part of the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the path to -# strip. -# -# Note that you can specify absolute paths here, but also relative paths, which -# will be relative from the directory where doxygen is started. -# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
- -STRIP_FROM_PATH = src/ - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the -# path mentioned in the documentation of a class, which tells the reader which -# header file to include in order to use a class. If left blank only the name of -# the header file containing the class definition is used. Otherwise one should -# specify the list of include paths that are normally passed to the compiler -# using the -I flag. - -STRIP_FROM_INC_PATH = src/ - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but -# less readable) file names. This can be useful is your file systems doesn't -# support long names like on DOS, Mac, or CD-ROM. -# The default value is: NO. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the -# first line (until the first dot) of a Javadoc-style comment as the brief -# description. If set to NO, the Javadoc-style will behave just like regular Qt- -# style comments (thus requiring an explicit @brief command for a brief -# description.) -# The default value is: NO. - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first -# line (until the first dot) of a Qt-style comment as the brief description. If -# set to NO, the Qt-style will behave just like regular Qt-style comments (thus -# requiring an explicit \brief command for a brief description.) -# The default value is: NO. - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a -# multi-line C++ special comment block (i.e. a block of //! or /// comments) as -# a brief description. This used to be the default behavior. The new default is -# to treat a multi-line C++ comment block as a detailed description. Set this -# tag to YES if you prefer the old behavior instead. -# -# Note that setting this tag to YES also means that rational rose comments are -# not recognized any more. -# The default value is: NO. 
- -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the -# documentation from any documented member that it re-implements. -# The default value is: YES. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a -# new page for each member. If set to NO, the documentation of a member will be -# part of the file/class/namespace that contains it. -# The default value is: NO. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen -# uses this value to replace tabs by spaces in code fragments. -# Minimum value: 1, maximum value: 16, default value: 4. - -TAB_SIZE = 8 - -# This tag can be used to specify a number of aliases that act as commands in -# the documentation. An alias has the form: -# name=value -# For example adding -# "sideeffect=@par Side Effects:\n" -# will allow you to put the command \sideeffect (or @sideeffect) in the -# documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. - -ALIASES = - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources -# only. Doxygen will then generate output that is more tailored for C. For -# instance, some of the names that are used will be different. The list of all -# members will be omitted, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or -# Python sources only. Doxygen will then generate output that is more tailored -# for that language. 
For instance, namespaces will be presented as packages, -# qualified scopes will look different, etc. -# The default value is: NO. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources. Doxygen will then generate output that is tailored for Fortran. -# The default value is: NO. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for VHDL. -# The default value is: NO. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. -# -# Note For files without extension you can use no_extension as a placeholder. -# -# Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments -# according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you can -# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in -# case of backward compatibilities issues. -# The default value is: YES. 
- -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented -# classes, or namespaces to their corresponding documentation. Such a link can -# be prevented in individual cases by by putting a % sign in front of the word -# or globally by setting AUTOLINK_SUPPORT to NO. -# The default value is: YES. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should set this -# tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); -# versus func(std::string) {}). This also make the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. -# The default value is: NO. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. -# The default value is: NO. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen -# will parse them like normal C++ but will assume all classes use public instead -# of private inheritance when no explicit protection keyword is present. -# The default value is: NO. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to indicate -# getter and setter methods for a property. Setting this option to YES will make -# doxygen to replace the get and set methods by a property in the documentation. -# This will only work if the methods are indeed getting or setting a simple -# type. If this is not the case, or you want to show the methods anyway, you -# should set this option to NO. -# The default value is: YES. 
- -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. -# The default value is: NO. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES to allow class member groups of the same type -# (for instance a group of public functions) to be put as a subgroup of that -# type (e.g. under the Public Functions section). Set it to NO to prevent -# subgrouping. Alternatively, this can be done per class using the -# \nosubgrouping command. -# The default value is: YES. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions -# are shown inside the group in which they are included (e.g. using \ingroup) -# instead of on a separate page (for HTML and Man pages) or section (for LaTeX -# and RTF). -# -# Note that this feature does not work in combination with -# SEPARATE_MEMBER_PAGES. -# The default value is: NO. - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions -# with only public data fields or simple typedef fields will be shown inline in -# the documentation of the scope in which they are defined (i.e. file, -# namespace, or group documentation), provided this scope is documented. If set -# to NO, structs, classes, and unions are shown on a separate page (for HTML and -# Man pages) or section (for LaTeX and RTF). -# The default value is: NO. - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or -# enum is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. 
When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically be -# useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. -# The default value is: NO. - -TYPEDEF_HIDES_STRUCT = NO - -# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This -# cache is used to resolve symbols given their name and scope. Since this can be -# an expensive process and often the same symbol appears multiple times in the -# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small -# doxygen will become slower. If the cache is too large, memory is wasted. The -# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range -# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 -# symbols. At the end of a run doxygen will report the cache usage and suggest -# the optimal cache size from a speed point of view. -# Minimum value: 0, maximum value: 9, default value: 0. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. Private -# class members and static file members will be hidden unless the -# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. -# Note: This will also disable the warnings about undocumented members that are -# normally produced when WARNINGS is set to YES. -# The default value is: NO. - -EXTRACT_ALL = YES - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will -# be included in the documentation. -# The default value is: NO. 
- -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -# scope will be included in the documentation. -# The default value is: NO. - -EXTRACT_PACKAGE = YES - -# If the EXTRACT_STATIC tag is set to YES all static members of a file will be -# included in the documentation. -# The default value is: NO. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined -# locally in source files will be included in the documentation. If set to NO -# only classes defined in header files are included. Does not have any effect -# for Java sources. -# The default value is: YES. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local methods, -# which are defined in the implementation section but not in the interface are -# included in the documentation. If set to NO only methods in the interface are -# included. -# The default value is: NO. - -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base name of -# the file that contains the anonymous namespace. By default anonymous namespace -# are hidden. -# The default value is: NO. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all -# undocumented members inside documented classes or files. If set to NO these -# members will be included in the various overviews, but no documentation -# section is generated. This option has no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_MEMBERS = NO - -# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. If set -# to NO these classes will be included in the various overviews. 
This option has -# no effect if EXTRACT_ALL is enabled. -# The default value is: NO. - -HIDE_UNDOC_CLASSES = NO - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO these declarations will be -# included in the documentation. -# The default value is: NO. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any -# documentation blocks found inside the body of a function. If set to NO these -# blocks will be appended to the function's detailed documentation block. -# The default value is: NO. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation that is typed after a -# \internal command is included. If the tag is set to NO then the documentation -# will be excluded. Set it to YES to include the internal documentation. -# The default value is: NO. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. -# The default value is: system dependent. - -CASE_SENSE_NAMES = NO - -# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with -# their full class and namespace scopes in the documentation. If set to YES the -# scope will be hidden. -# The default value is: NO. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of -# the files that are included by a file in the documentation of that file. -# The default value is: YES. 
- -SHOW_INCLUDE_FILES = YES - - -SHOW_GROUPED_MEMB_INC = NO - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include -# files with double quotes in the documentation rather than with sharp brackets. -# The default value is: NO. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the -# documentation for inline members. -# The default value is: YES. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the -# (detailed) documentation of file and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. -# The default value is: YES. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief -# descriptions of file, namespace and class members alphabetically by member -# name. If set to NO the members will appear in declaration order. Note that -# this will also influence the order of the classes in the class list. -# The default value is: NO. - -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the -# (brief and detailed) documentation of class members so that constructors and -# destructors are listed first. If set to NO the constructors will appear in the -# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. -# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief -# member documentation. -# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting -# detailed member documentation. -# The default value is: NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy -# of group names into alphabetical order. If set to NO the group names will -# appear in their defined order. -# The default value is: NO. 
- -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by -# fully-qualified names, including namespaces. If set to NO, the class list will -# be sorted only by class name, not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the alphabetical -# list. -# The default value is: NO. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper -# type resolution of all parameters of a function it will reject a match between -# the prototype and the implementation of a member function even if there is -# only one candidate or it is obvious which candidate to choose by doing a -# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still -# accept a match between prototype and implementation in such cases. -# The default value is: NO. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the -# todo list. This list is created by putting \todo commands in the -# documentation. -# The default value is: YES. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the -# test list. This list is created by putting \test commands in the -# documentation. -# The default value is: YES. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug -# list. This list is created by putting \bug commands in the documentation. -# The default value is: YES. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) -# the deprecated list. This list is created by putting \deprecated commands in -# the documentation. -# The default value is: YES. 
- -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional documentation -# sections, marked by \if ... \endif and \cond -# ... \endcond blocks. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the -# initial value of a variable or macro / define can have for it to appear in the -# documentation. If the initializer consists of more lines than specified here -# it will be hidden. Use a value of 0 to hide initializers completely. The -# appearance of the value of individual variables and macros / defines can be -# controlled using \showinitializer or \hideinitializer command in the -# documentation regardless of this setting. -# Minimum value: 0, maximum value: 10000, default value: 30. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at -# the bottom of the documentation of classes and structs. If set to YES the list -# will mention the files that were used to generate the documentation. -# The default value is: YES. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This -# will remove the Files entry from the Quick Index and from the Folder Tree View -# (if specified). -# The default value is: YES. - -SHOW_FILES = YES - -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces -# page. This will remove the Namespaces entry from the Quick Index and from the -# Folder Tree View (if specified). -# The default value is: YES. - -SHOW_NAMESPACES = YES - -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). 
Doxygen will invoke the program by executing (via -# popen()) the command command input-file, where command is the value of the -# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided -# by doxygen. Whatever the program writes to standard output is used as the file -# version. For an example see the documentation. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. To create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. You can -# optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. -# -# Note that if you run doxygen from a directory containing a file called -# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE -# tag is left empty. - -LAYOUT_FILE = - -# The CITE_BIB_FILES tag can be used to specify one or more bib files containing -# the reference definitions. This must be a list of .bib files. The .bib -# extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. -# For LaTeX the style of the bibliography can be controlled using -# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the -# search path. Do not use file names with spaces, bibtex cannot handle them. See -# also \cite for info how to create references. - -CITE_BIB_FILES = - -#--------------------------------------------------------------------------- -# Configuration options related to warning and progress messages -#--------------------------------------------------------------------------- - -# The QUIET tag can be used to turn on/off the messages that are generated to -# standard output by doxygen. 
If QUIET is set to YES this implies that the -# messages are off. -# The default value is: NO. - -QUIET = NO - -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES -# this implies that the warnings are on. -# -# Tip: Turn warnings on while writing the documentation. -# The default value is: YES. - -WARNINGS = YES - -# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate -# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag -# will automatically be disabled. -# The default value is: YES. - -WARN_IF_UNDOCUMENTED = YES - -# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. -# The default value is: YES. - -WARN_IF_DOC_ERROR = YES - -# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that -# are documented, but have no documentation for their parameters or return -# value. If set to NO doxygen will only warn about wrong or incomplete parameter -# documentation, but not about the absence of documentation. -# The default value is: NO. - -WARN_NO_PARAMDOC = NO - -# The WARN_FORMAT tag determines the format of the warning messages that doxygen -# can produce. The string should contain the $file, $line, and $text tags, which -# will be replaced by the file and line number from which the warning originated -# and the warning text. Optionally the format may contain $version, which will -# be replaced by the version of the file (if it could be obtained via -# FILE_VERSION_FILTER) -# The default value is: $file:$line: $text. - -WARN_FORMAT = "$file:$line: $text" - -# The WARN_LOGFILE tag can be used to specify a file to which warning and error -# messages should be written. 
If left blank the output is written to standard -# error (stderr). - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# Configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag is used to specify the files and/or directories that contain -# documented source files. You may enter file names like myfile.cpp or -# directories like /usr/src/myproject. Separate the files or directories with -# spaces. -# Note: If this tag is empty the current directory is searched. - -INPUT = src - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses -# libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of -# possible encodings. -# The default value is: UTF-8. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank the -# following patterns are tested:*.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii, -# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, -# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, -# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf, -# *.qsf, *.as and *.js. - -FILE_PATTERNS = *.c *.h *.cpp *.f90 - -# The RECURSIVE tag can be used to specify whether or not subdirectories should -# be searched for input files as well. -# The default value is: NO. - -RECURSIVE = YES - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. 
This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = src/imported src/rdtsc.h - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. -# The default value is: NO. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* - -EXCLUDE_SYMBOLS = - -# The EXAMPLE_PATH tag can be used to specify one or more files or directories -# that contain example code fragments that are included (see the \include -# command). - -EXAMPLE_PATH = - -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and -# *.h) to filter out the source-files in the directories. If left blank all -# files are included. 
- -EXAMPLE_PATTERNS = * - -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude commands -# irrespective of the value of the RECURSIVE tag. -# The default value is: NO. - -EXAMPLE_RECURSIVE = NO - -# The IMAGE_PATH tag can be used to specify one or more files or directories -# that contain images that are to be included in the documentation (see the -# \image command). - -IMAGE_PATH = - -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command: -# -# -# -# where is the value of the INPUT_FILTER tag, and is the -# name of an input file. Doxygen will then use the output that the filter -# program writes to standard output. If FILTER_PATTERNS is specified, this tag -# will be ignored. -# -# Note that the filter must not add or remove lines; it is applied before the -# code is scanned, but not when the output code is generated. If lines are added -# or removed, the anchors will not be placed correctly. - -INPUT_FILTER = - -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: pattern=filter -# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how -# filters are used. If the FILTER_PATTERNS tag is empty or if none of the -# patterns match the file name, INPUT_FILTER is applied. - -FILTER_PATTERNS = - -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER ) will also be used to filter the input files that are used for -# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). -# The default value is: NO. 
- -FILTER_SOURCE_FILES = NO - -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and -# it is also possible to disable source filtering for a specific pattern using -# *.ext= (so without naming a filter). -# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. - -FILTER_SOURCE_PATTERNS = - -# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that -# is part of the input, its contents will be placed on the main page -# (index.html). This can be useful if you have a project on for instance GitHub -# and want to reuse the introduction page also for the doxygen output. - -USE_MDFILE_AS_MAINPAGE = - -#--------------------------------------------------------------------------- -# Configuration options related to source browsing -#--------------------------------------------------------------------------- - -# If the SOURCE_BROWSER tag is set to YES then a list of source files will be -# generated. Documented entities will be cross-referenced with these sources. -# -# Note: To get rid of all source code in the generated output, make sure that -# also VERBATIM_HEADERS is set to NO. -# The default value is: NO. - -SOURCE_BROWSER = YES - -# Setting the INLINE_SOURCES tag to YES will include the body of functions, -# classes and enums directly into the documentation. -# The default value is: NO. - -INLINE_SOURCES = NO - -# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any -# special comment blocks from generated source code fragments. Normal C, C++ and -# Fortran comments will always remain visible. -# The default value is: YES. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. -# The default value is: NO. 
- -REFERENCED_BY_RELATION = YES - -# If the REFERENCES_RELATION tag is set to YES then for each documented function -# all documented entities called/used by that function will be listed. -# The default value is: NO. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set -# to YES, then the hyperlinks from functions in REFERENCES_RELATION and -# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will -# link to the documentation. -# The default value is: YES. - -REFERENCES_LINK_SOURCE = YES - -# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the -# source code will show a tooltip with additional information such as prototype, -# brief description and links to the definition and documentation. Since this -# will make the HTML file larger and loading of large files a bit slower, you -# can opt to disable this feature. -# The default value is: YES. -# This tag requires that the tag SOURCE_BROWSER is set to YES. - -SOURCE_TOOLTIPS = YES - -# If the USE_HTAGS tag is set to YES then the references to source code will -# point to the HTML generated by the htags(1) tool instead of doxygen built-in -# source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version -# 4.8.6 or higher. -# -# To use it do the following: -# - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file -# - Make sure the INPUT points to the root of the source tree -# - Run doxygen as normal -# -# Doxygen will invoke htags (and that will in turn invoke gtags), so these -# tools must be available from the command line (i.e. in the search path). -# -# The result: instead of the source browser generated by doxygen, the links to -# source code will now point to the output of htags. -# The default value is: NO. 
-# This tag requires that the tag SOURCE_BROWSER is set to YES. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a -# verbatim copy of the header file for each class for which an include is -# specified. Set to NO to disable this. -# See also: Section \class. -# The default value is: YES. - -VERBATIM_HEADERS = YES - -# If the CLANG_ASSISTED_PARSING tag is set to YES, then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more acurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. -# Note: The availability of this option depends on whether or not doxygen was -# compiled with the --with-libclang option. -# The default value is: NO. - -CLANG_ASSISTED_PARSING = NO - -# If clang assisted parsing is enabled you can provide the compiler with command -# line options that you would normally use when invoking the compiler. Note that -# the include paths will already be set by doxygen for the files and directories -# specified with INPUT and INCLUDE_PATH. -# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. - -CLANG_OPTIONS = - -#--------------------------------------------------------------------------- -# Configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all -# compounds will be generated. Enable this if the project contains a lot of -# classes, structs, unions or interfaces. -# The default value is: YES. - -ALPHABETICAL_INDEX = YES - -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
- -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output -# The default value is: YES. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a -# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of -# it. -# The default directory is: html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_OUTPUT = html - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each -# generated HTML page (for example: .htm, .php, .asp). -# The default value is: .html. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a user-defined HTML header file for -# each generated HTML page. If the tag is left blank doxygen will generate a -# standard header. -# -# To get valid HTML the header file that includes any scripts and style sheets -# that doxygen needs, which is dependent on the configuration options used (e.g. -# the setting GENERATE_TREEVIEW). It is highly recommended to start with a -# default header using -# doxygen -w html new_header.html new_footer.html new_stylesheet.css -# YourConfigFile -# and then modify the file new_header.html. See also section "Doxygen usage" -# for information on how to generate the default header that doxygen normally -# uses. 
-# Note: The header is subject to change so you typically have to regenerate the -# default header when upgrading to a newer version of doxygen. For a description -# of the possible markers and block names see the documentation. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each -# generated HTML page. If the tag is left blank doxygen will generate a standard -# footer. See HTML_HEADER for more information on how to generate a default -# footer and what special commands can be used inside the footer. See also -# section "Doxygen usage" for information on how to generate the default footer -# that doxygen normally uses. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style -# sheet that is used by each HTML page. It can be used to fine-tune the look of -# the HTML output. If left blank doxygen will generate a default style sheet. -# See also section "Doxygen usage" for information on how to generate the style -# sheet that doxygen normally uses. -# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as -# it is more robust and this tag (HTML_STYLESHEET) will in the future become -# obsolete. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user- -# defined cascading style sheet that is included after the standard style sheets -# created by doxygen. Using this option one can overrule certain style aspects. -# This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefor more robust against future updates. -# Doxygen will copy the style sheet file to the output directory. For an example -# see the documentation. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that the -# files will be copied as-is; there are no commands or markers available. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen -# will adjust the colors in the stylesheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value -# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 -# purple, and 360 is red again. -# Minimum value: 0, maximum value: 359, default value: 220. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A -# value of 255 will produce the most vivid colors. -# Minimum value: 0, maximum value: 255, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the -# luminance component of the colors in the HTML output. Values below 100 -# gradually make the output lighter, whereas values above 100 make the output -# darker. The value divided by 100 is the actual gamma applied, so 80 represents -# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not -# change the gamma. 
-# Minimum value: 40, maximum value: 240, default value: 80. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries -# shown in the various tree structured indices initially; the user can expand -# and collapse entries dynamically later on. Doxygen will expand the tree to -# such a level that at most the specified number of entries are visible (unless -# a fully collapsed tree already exceeds this amount). So setting the number of -# entries 1 will produce a full collapsed tree by default. 0 is a special value -# representing an infinite number of entries and will result in a full expanded -# tree by default. -# Minimum value: 0, maximum value: 9999, default value: 100. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files will be -# generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. 
Running make will produce the docset in -# that directory and running make install will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_DOCSET = NO - -# This tag determines the name of the docset feed. A documentation feed provides -# an umbrella under which multiple documentation sets from a single provider -# (such as a company or product suite) can be grouped. -# The default value is: Doxygen generated docs. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# This tag specifies a string that should uniquely identify the documentation -# set bundle. This should be a reverse domain-name style string, e.g. -# com.mycompany.MyDocSet. Doxygen will append .docset to the name. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style -# string, e.g. com.mycompany.MyDocSet.documentation. -# The default value is: org.doxygen.Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. -# The default value is: Publisher. -# This tag requires that the tag GENERATE_DOCSET is set to YES. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three -# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The -# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. -# -# The HTML Help Workshop contains a compiler that can convert all HTML output -# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML -# files are now used as the Windows 98 help format, and will replace the old -# Windows help format (.hlp) on all Windows platforms in the future. Compressed -# HTML files also contain an index, a table of contents, and you can search for -# words in the documentation. The HTML workshop also contains a viewer for -# compressed HTML files. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_HTMLHELP = NO - -# The CHM_FILE tag can be used to specify the file name of the resulting .chm -# file. You can add a path in front of the file if the result should not be -# written to the html output directory. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -CHM_FILE = - -# The HHC_LOCATION tag can be used to specify the location (absolute path -# including file name) of the HTML help compiler ( hhc.exe). If non-empty -# doxygen will try to run the HTML help compiler on the generated index.hhp. -# The file has to be specified with full path. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -HHC_LOCATION = - -# The GENERATE_CHI flag controls if a separate .chi index file is generated ( -# YES) or that it should be included in the master .chm file ( NO). -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -GENERATE_CHI = NO - -# The CHM_INDEX_ENCODING is used to encode HtmlHelp index ( hhk), content ( hhc) -# and project file content. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
- -CHM_INDEX_ENCODING = - -# The BINARY_TOC flag controls whether a binary table of contents is generated ( -# YES) or a normal table of contents ( NO) in the .chm file. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members to -# the table of contents of the HTML help documentation and to the tree view. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTMLHELP is set to YES. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that -# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help -# (.qch) of the generated HTML documentation. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify -# the file name of the resulting .qch file. The path specified is relative to -# the HTML output folder. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help -# Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt -# Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- -# folders). -# The default value is: doc. -# This tag requires that the tag GENERATE_QHP is set to YES. 
- -QHP_VIRTUAL_FOLDER = doc - -# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom -# filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- -# filters). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHP_SECT_FILTER_ATTRS = - -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. -# This tag requires that the tag GENERATE_QHP is set to YES. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be -# generated, together with the HTML files, they form an Eclipse help plugin. To -# install this plugin and make it available under the help contents menu in -# Eclipse, the contents of the directory containing the HTML and XML files needs -# to be copied into the plugins directory of eclipse. The name of the directory -# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. -# After copying Eclipse needs to be restarted before the help appears. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. 
- -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the Eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have this -# name. Each documentation set should have its own identifier. -# The default value is: org.doxygen.Project. -# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# If you want full control over the layout of the generated HTML pages it might -# be necessary to disable the index and replace it with your own. The -# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top -# of each HTML page. A value of NO enables the index and the value YES disables -# it. Since the tabs in the index contain the same information as the navigation -# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. If the tag -# value is set to YES, a side panel will be generated containing a tree-like -# index structure (just like the one that is generated for HTML Help). For this -# to work a browser that supports JavaScript, DHTML, CSS and frames is required -# (i.e. any modern browser). Windows users are probably better off using the -# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. -# The default value is: NO. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that -# doxygen will group on one line in the generated HTML documentation. -# -# Note that a value of 0 will completely suppress the enum values from appearing -# in the overview section. -# Minimum value: 0, maximum value: 20, default value: 4. -# This tag requires that the tag GENERATE_HTML is set to YES. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used -# to set the initial width (in pixels) of the frame in which the tree is shown. -# Minimum value: 0, maximum value: 1500, default value: 250. -# This tag requires that the tag GENERATE_HTML is set to YES. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to -# external symbols imported via tag files in a separate window. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of LaTeX formulas included as images in -# the HTML documentation. When you change the font size after a successful -# doxygen run you need to manually remove any form_*.png images from the HTML -# output directory to force them to be regenerated. -# Minimum value: 8, maximum value: 50, default value: 10. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES.
- -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering -# instead of using prerendered bitmaps. Use this if you do not have LaTeX -# installed or if you want the formulas to look prettier in the HTML output. When -# enabled you may also need to install MathJax separately and configure the path -# to it using the MATHJAX_RELPATH option. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -USE_MATHJAX = NO - -# When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. -# Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. -# The default value is: HTML-CSS. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_FORMAT = HTML-CSS - -# When MathJax is enabled you need to specify the location relative to the HTML -# output directory using the MATHJAX_RELPATH option. The destination directory -# should contain the MathJax.js script. For instance, if the mathjax directory -# is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax -# Content Delivery Network so you can quickly see the result without installing -# MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax -# extension names that should be enabled during MathJax rendering.
For example -# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_EXTENSIONS = - -# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces -# of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an -# example see the documentation. -# This tag requires that the tag USE_MATHJAX is set to YES. - -MATHJAX_CODEFILE = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box for -# the HTML output. The underlying search engine uses javascript and DHTML and -# should work on any modern browser. Note that when using HTML help -# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) -# there is already a search function so this one should typically be disabled. -# For large projects the javascript based search engine can be slow, then -# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to -# search using the keyboard; to jump to the search box use <access key> + S -# (what the <access key> is depends on the OS and browser, but it is typically -# <CTRL>, <ALT>/