Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/fiat/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ if( ${CMAKE_SYSTEM_NAME} MATCHES "Darwin" )
endif()

target_compile_definitions( fiat PRIVATE ${FIAT_DEFINITIONS} )
target_compile_definitions( fiat PRIVATE fiat_GIT_SHA1="${fiat_GIT_SHA1}" )

if( HAVE_FCKIT )
target_link_libraries( fiat PRIVATE fckit )
Expand Down
253 changes: 182 additions & 71 deletions src/fiat/drhook/drhook.c
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,11 @@ static long long int opt_timeline_freq = 1000000; /* How often to print : every
static double opt_timeline_MB = 1.0; /* ... rss or curheap jumps up/down by more than this many MBytes (default = 1) : unit MBytes */

static volatile sig_atomic_t opt_gencore = 0;
static int opt_gencore_signal = 0;
/* Indexed by signal number (1..NSIG). NSIG might not count signal 0, so allocate one extra int to keep index NSIG in bounds. */
static int opt_gencore_signals[NSIG + 1];
static int* opt_gencore_processes;
static int opt_gencore_user_specified = 0;
static int opt_gencore_all_procs = 0;

static int opt_random_memstat = 0; /* > 0 if to obtain random memory stats (maxhwm, maxstk) for tid=1. Updated when rand() % opt_random_memstat == 0 */

Expand Down Expand Up @@ -558,8 +562,6 @@ static drhook_calltree_t **thiscall = NULL;
static int signals_set = 0;
static volatile sig_atomic_t signal_handler_called = 0;
static volatile sig_atomic_t signal_handler_ignore_atexit = 0;
static volatile sig_atomic_t unlimited_corefile_retcode = 9999;
static volatile unsigned long long int saved_corefile_hardlimit = 0;
static int allow_coredump = 0; /* -1 denotes ALL MPI-tasks, 1..NPES == myproc, 0 = coredump will not be enabled by DrHook at init */
static drhook_sig_t siglist[1+NSIG] = { 0 };
static char *a_out = NULL;
Expand Down Expand Up @@ -751,7 +753,7 @@ static void dump_hugepages(int enforce, const char *pfx, int tid, int sig, int n

/*--- set_default_handler ---*/

static int set_unlimited_corefile(unsigned long long int *hardlimit, int enforce);
static int set_corefile_to_hard_limit(unsigned long long int *hardlimit, int enforce);

static int set_default_handler(int sig, int unlimited_corefile, int verbose)
{
Expand All @@ -766,12 +768,14 @@ static int set_default_handler(int sig, int unlimited_corefile, int verbose)
sigaddset(&sa.sa_mask, some_signal_to_be_blocked); ... just in case
*/
sigaction(sig, &sa, NULL);
if (unlimited_corefile) rc = set_unlimited_corefile(&hardlimit,0); /* unconditionally */
// TODO: Is this needed here?
// if (unlimited_corefile) rc = set_corefile_to_hard_limit(&hardlimit,0); /* unconditionally */
if (verbose) {
int tid = drhook_oml_get_thread_num();
char *pfx = PREFIX(tid);
char buf[128] = "";
if (unlimited_corefile && rc == 0) snprintf(buf,sizeof(buf)," -- hardlimit for core file is now %llu (0x%llx)", hardlimit, hardlimit);
// TODO: Is this needed here?
// if (unlimited_corefile && rc == 0) snprintf(buf,sizeof(buf)," -- hardlimit for core file is now %llu (0x%llx)", hardlimit, hardlimit);
fprintf(stderr,
"%s %s [%s@%s:%d] "
"Enabled default signal handler (SIG_DFL) for signal#%d%s\n",
Expand Down Expand Up @@ -1437,7 +1441,7 @@ ignore_signals(int silent)
#define DRH_GETRLIMIT getrlimit
#define DRH_SETRLIMIT setrlimit

static int set_unlimited_corefile(unsigned long long int *hardlimit, int enforce)
static int set_corefile_to_hard_limit(unsigned long long int *hardlimit, int enforce)
{
/*
Make sure we *only* set soft-limit (not hard-limit) to 0 in our scripts i.e. :
Expand All @@ -1446,46 +1450,77 @@ static int set_unlimited_corefile(unsigned long long int *hardlimit, int enforce
$ ulimit -c 0
See man ksh or man bash for more
*/
int rc = -1;
if (enforce || unlimited_corefile_retcode == 9999) { /* Done only once -- or if enforced*/
static int previously_set = 0;
static volatile unsigned long long int saved_corefile_hardlimit = 0;
/*
Mirror old behaviour where this either returns the previous successful value
or 0 if it was never successfully set
*/
if (hardlimit) *hardlimit = saved_corefile_hardlimit;

if (enforce || !previously_set) { /* Done only once -- or if enforced*/

DRH_STRUCT_RLIMIT r;
if (DRH_GETRLIMIT(RLIMIT_CORE, &r) == 0) {
r.rlim_cur = r.rlim_max;
if (DRH_SETRLIMIT(RLIMIT_CORE, &r) == 0) {
saved_corefile_hardlimit = r.rlim_cur;
rc = 0;
}
}
unlimited_corefile_retcode = rc;

if (DRH_GETRLIMIT(RLIMIT_CORE, &r)) return -1;
if (!r.rlim_cur) r.rlim_cur = r.rlim_max;

if (DRH_SETRLIMIT(RLIMIT_CORE, &r)) return -1;

saved_corefile_hardlimit = r.rlim_cur;
previously_set = 1;
}

if (hardlimit) *hardlimit = saved_corefile_hardlimit;
rc = unlimited_corefile_retcode;
return rc;
return 0;
}

static void
signal_gencore(int sig SIG_EXTRA_ARGS)
{
if (opt_gencore > 0) {
opt_gencore = 0; /* A tiny chance for a race condition between threads */
if (sig == opt_gencore_signal && sig >= 1 && sig <= NSIG) {
signal(sig, SIG_IGN);
signal(SIGABRT, SIG_DFL);
{ /* Enable unlimited cores (up to hard-limit) and call abort() --> generates core dump */
if (set_unlimited_corefile(NULL,1) == 0) {
int tid = drhook_oml_get_thread_num();
char *pfx = PREFIX(tid);
fprintf(stderr,
"%s %s [%s@%s:%d] Received signal#%d and now calling abort() ...\n",
pfx,TIMESTR(tid),FFL,
sig);
LinuxTraceBack(pfx,TIMESTR(tid),NULL);
abort(); /* Dump core, too */
if (opt_gencore) {
if ( sig >= 1 && sig <= NSIG && opt_gencore_signals[sig] ) {
/* User has specified procs & I'm that proc
* or user hasn't specified procs & either all procs dump or should attempt getting a lock */
if ( (opt_gencore_user_specified && opt_gencore_processes[myproc]) ||
(!opt_gencore_user_specified && (opt_gencore_all_procs || drhook_use_lockfile)) ) {
int fd = -1;
if (drhook_use_lockfile)
fd = open(drhook_lockfile,O_CREAT|O_WRONLY|O_TRUNC|O_EXCL,S_IRUSR|S_IWUSR);

/* Allowed through or gotten lock */
if (opt_gencore_all_procs || !drhook_use_lockfile || (drhook_use_lockfile && fd >= 0)) {

/* Ignore whatever signal brought us here (In case other processes get it too),
* and restore the default handler for aborts
*/
signal(sig, SIG_IGN);
signal(SIGABRT, SIG_DFL);
/* If we got through with a file lock, note some details and safely close it */
if (fd >= 0) {
size_t count = sizeof(myproc);
ssize_t sz = write(fd, &myproc, count); // Now we know which MPL-task got the lock (use octal-dump "od" command)
close(fd);
}

// TODO: Should set_corefile_to_hard_limit be here? We check it with process_options, but it could change between then and now
{ /* Enable unlimited cores (up to hard-limit) and call abort() --> generates core dump */
if (!set_corefile_to_hard_limit(NULL, 1)) {
int tid = drhook_oml_get_thread_num();
char *pfx = PREFIX(tid);
fprintf(stderr,
"%s %s [%s@%s:%d] Received signal#%d and now calling abort() ...\n",
pfx, TIMESTR(tid), FFL,
sig);
LinuxTraceBack(pfx, TIMESTR(tid), NULL);
abort(); /* Dump core, too. This should now call the kernel's handler */
}
}
/* Should never end up here */
_exit(128 + ABS(sig));
}
}
/* Should never end up here */
_exit(128+ABS(sig));
} /* if (sig == opt_gencore_signal && sig >= 1 && sig <= NSIG) */
} /* if ( sig >= 1 && sig <= NSIG && opt_gencore_signals[sig] ) */
}
}

Expand Down Expand Up @@ -1640,15 +1675,15 @@ signal_drhook(int sig SIG_EXTRA_ARGS)
" %lldMB (maxrss), %lldMB (maxstack), %lldMB (vmpeak), %lld (paging), nsigs = %d\n",
pfx,TIMESTR(tid),FFL,
sig, sl->name, hwm, rss, maxstack, vmpeak, pag, nsigs);
if (allow_coredump) {
unsigned long long int hardlimit = 0;
int rc = set_unlimited_corefile(&hardlimit,1);
if (rc == 0) {
fprintf(stderr,
"%s %s [%s@%s:%d] Hardlimit for core file is now %llu (0x%llx)\n",
pfx,TIMESTR(tid),FFL,hardlimit,hardlimit);
}
}
// if (allow_coredump) {
// unsigned long long int hardlimit = 0;
// int rc = set_corefile_to_hard_limit(&hardlimit,1);
// if (rc == 0) {
// fprintf(stderr,
// "%s %s [%s@%s:%d] Hardlimit for core file is now %llu (0x%llx)\n",
// pfx,TIMESTR(tid),FFL,hardlimit,hardlimit);
// }
// }

#if 1
fprintf(stderr,
Expand Down Expand Up @@ -2109,10 +2144,13 @@ signal_drhook_init(int enforce)
#endif
*/
catch_signals(silent); /* Additional signals to be seen by DR_HOOK */
if (opt_gencore > 0 && opt_gencore_signal >= 1 && opt_gencore_signal <= NSIG) {
drhook_sigfunc_t u;
u.func3args = signal_gencore;
signal(opt_gencore_signal, u.func1args); /* A facility to dump core */
if (opt_gencore) {
for (int cur_signal = 0; cur_signal <= NSIG; cur_signal++) {
if (!opt_gencore_signals[cur_signal]) continue;
drhook_sigfunc_t u;
u.func3args = signal_gencore;
signal(cur_signal, u.func1args); /* A facility to dump core */
}
}
signals_set = 1; /* Signals are set now */
}
Expand Down Expand Up @@ -2206,6 +2244,7 @@ process_options()

if(fp) fprintf(fp,"[EC_DRHOOK:hostname:myproc:omltid:pid:unixtid] [YYYYMMDD:HHMMSS:walltime] [function@file:lineno] -- Max OpenMP threads = %d\n",drhook_oml_get_max_threads());

OPTPRINT(fp,"%s %s [%s@%s:%d] Built from commit %s\n",pfx,TIMESTR(tid),FFL,BUILD_GIT_HASH);
OPTPRINT(fp,"%s %s [%s@%s:%d] fp = %p\n",pfx,TIMESTR(tid),FFL,(void*)fp);

env = getenv("ATP_ENABLED");
Expand All @@ -2223,24 +2262,6 @@ process_options()
OPTPRINT(fp,"%s %s [%s@%s:%d] ATP_IGNORE_SIGTERM=%d\n",pfx,TIMESTR(tid),FFL,atp_ignore_sigterm);
}

env = getenv("DR_HOOK_ALLOW_COREDUMP");
if (env) {
ienv = atoi(env);
allow_coredump = (ienv == -1 || ienv == myproc) ? ienv : 0;
}
OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_ALLOW_COREDUMP=%d\n",pfx,TIMESTR(tid),FFL,allow_coredump);
#if 0
// Postponed until DrHook actually has caught the signal
if (allow_coredump) {
unsigned long long int hardlimit = 0;
int rc = set_unlimited_corefile(&hardlimit,1);
if (rc == 0) {
OPTPRINT(fp,"%s %s [%s@%s:%d] Hardlimit for core file is now %llu (0x%llx)\n",
pfx,TIMESTR(tid),FFL,hardlimit,hardlimit);
}
}
#endif

env = getenv("DR_HOOK_PROFILE");
if (env) {
char *s = calloc_drhook(strlen(env) + 15, sizeof(*s));
Expand Down Expand Up @@ -2465,17 +2486,96 @@ process_options()
opt_gencore = atoi(env);
}

int print_gencore_signals = 0;
if (opt_gencore) {
OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_GENCORE=%d\n",pfx,TIMESTR(tid),FFL,opt_gencore);

/* This is here to not break the previous flags */
env = getenv("DR_HOOK_GENCORE_SIGNAL");
if (env) {
int itmp = atoi(env);
if (itmp >= 1 && itmp <= NSIG && itmp != SIGABRT) {
opt_gencore_signal = itmp;
opt_gencore_signals[itmp] = 1;
print_gencore_signals = 1;
}
}

env = getenv("DR_HOOK_GENCORE_SIGNALS");
if (env) {
print_gencore_signals = 1;
const char delim[] = ", \t/";
char *s = strdup_drhook(env);
char *p = strtok(s,delim);

while (p) {
int itmp = atoi(p);
if (1 <= itmp && itmp <= NSIG && itmp != SIGABRT)
opt_gencore_signals[itmp] = 1;
p = strtok(NULL,delim);
}
free_drhook(s);

if (print_gencore_signals) {
OPTPRINT(fp, "%s %s [%s@%s:%d] DR_HOOK_GENCORE_SIGNALS=", pfx, TIMESTR(tid), FFL);
for (int i = 0; i < NSIG; i++) {
OPTPRINT(fp, "%d:%d, ", i, opt_gencore_signals[i]);
}
OPTPRINT(fp, "%d:%d\n", NSIG, opt_gencore_signals[NSIG]);
}
}

env = getenv("DR_HOOK_GENCORE_PROCS");
if (env) {
// TODO: Is nproc the right size??
opt_gencore_processes = calloc_drhook(nproc, sizeof(int));
opt_gencore_user_specified = 1;
const char delim[] = ", \t/";
char *s = strdup_drhook(env);
char *p = strtok(s,delim);

while (p) {
int itmp = atoi(p);
if (0 <= itmp && itmp < nproc)
opt_gencore_processes[itmp] = 1;
p = strtok(NULL,delim);
}
free_drhook(s);

OPTPRINT(fp, "%s %s [%s@%s:%d] DR_HOOK_GENCORE_PROCS=", pfx, TIMESTR(tid), FFL);
for (int i = 0; i < nproc - 1; i++) {
OPTPRINT(fp, "%d:%d, ", i, opt_gencore_processes[i]);
}
OPTPRINT(fp, "%d:%d\n", nproc - 1, opt_gencore_processes[nproc - 1]);
}

/* Super secret flag that enables the fs killing opt_gencore_all_procs option */
env = getenv("DR_HOOK_SECRET");
if (env) {
opt_gencore_all_procs = atoi(env);

OPTPRINT(fp, "%s %s [%s@%s:%d] WARNING: The following option can easily bring down entire file systems on its own. "
"By enabling this, you are claiming you know what you're doing. If you do not, then disable it IMMEDIATELY!\n",
pfx, TIMESTR(tid), FFL);
OPTPRINT(fp, "%s %s [%s@%s:%d] DR_HOOK_SECRET=%d\n", pfx, TIMESTR(tid), FFL, opt_gencore_all_procs);
}
}

env = getenv("DR_HOOK_ALLOW_COREDUMP");
if (env) {
ienv = atoi(env);
allow_coredump = (ienv == -1 || ienv == myproc) ? ienv : 0;
}

/* opt_gencore implies allow_coredump */
allow_coredump |= opt_gencore;
OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_ALLOW_COREDUMP=%d\n",pfx,TIMESTR(tid),FFL,allow_coredump);

if (allow_coredump) {
unsigned long long int hardlimit = 0;
int rc = set_corefile_to_hard_limit(&hardlimit,1);
if (rc == 0) {
OPTPRINT(fp,"%s %s [%s@%s:%d] Hardlimit for core file is now %llu (0x%llx)\n",
pfx,TIMESTR(tid),FFL,hardlimit,hardlimit);
}
OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_GENCORE_SIGNAL=%d\n",pfx,TIMESTR(tid),FFL,opt_gencore_signal);
}

newline = 0;
Expand Down Expand Up @@ -3940,6 +4040,7 @@ c_drhook_print_(const int *ftnunitno,
)
{
static int first_time = 0;
static int reported_open_regions = 0;
int tid = (thread_id && (*thread_id >= 1) && (*thread_id <= numthreads))
? *thread_id : drhook_oml_get_thread_num();
int mytid = drhook_oml_get_thread_num();
Expand Down Expand Up @@ -4189,6 +4290,11 @@ c_drhook_print_(const int *ftnunitno,
cycles[t] += self_cycles;
}
nprof++;
} else if (keyptr->name && keyptr->status > 0 && !reported_open_regions) {
fprintf(stderr,
"%s %s [%s@%s:%d] WARNING: Region '%s' was never closed or stopped by a signal (Opened %d time(s) without closing). No output will be produced for this region.\n",
pfx,TIMESTR(tid),FFL, keyptr->name, keyptr->status);
reported_open_regions = 1;
}
keyptr = keyptr->next;
} /* while (keyptr && keyptr->status == 0) */
Expand Down Expand Up @@ -4511,6 +4617,11 @@ c_drhook_print_(const int *ftnunitno,
tot[t] += self;
maxseen_tot[t] = MAX(maxseen_tot[t], keyptr->mem_seenmax);
nprof++;
} else if (keyptr->name && keyptr->status > 0 && !reported_open_regions) {
fprintf(stderr,
"%s %s [%s@%s:%d] WARNING: Region '%s' was never closed or stopped by a signal (Opened %d time(s) without closing). No output will be produced for this region.\n",
pfx,TIMESTR(tid),FFL, keyptr->name, keyptr->status);
reported_open_regions = 1;
}
keyptr = keyptr->next;
} /* while (keyptr && keyptr->status == 0) */
Expand Down
2 changes: 2 additions & 0 deletions tests/drhook/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,5 @@ ecbuild_add_test( TARGET fiat_test_drhook_ex5

# TODO:
# Better parse output to see if it matches.

add_subdirectory(drhook_flags)
Loading