diff --git a/CMakeLists.txt b/CMakeLists.txt index 1469161b..81fd3ea2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,13 +23,17 @@ set(CMAKE_CXX_STANDARD_REQUIRED TRUE) ### Options ecbuild_add_option( FEATURE OMP - DESCRIPTION "support for OpenMP shared memory parallelism" + DESCRIPTION "Support for OpenMP shared memory parallelism" REQUIRED_PACKAGES "OpenMP COMPONENTS Fortran" ) ecbuild_add_option( FEATURE MPI DESCRIPTION "Support for MPI distributed parallelism" REQUIRED_PACKAGES "MPI COMPONENTS Fortran" ) +ecbuild_add_option( FEATURE DR_HOOK_PAPI + DESCRIPTION "Support for HW counters in DR_HOOK via PAPI" + REQUIRED_PACKAGES "PAPI") + ecbuild_find_package( fckit QUIET ) ecbuild_add_option( FEATURE FCKIT DESCRIPTION "Support for fckit" @@ -51,7 +55,6 @@ ecbuild_add_option( FEATURE WARNINGS DEFAULT ON DESCRIPTION "Add warnings to compiler" ) - ecbuild_add_option( FEATURE DR_HOOK_NVTX DEFAULT ${DEFAULT_DR_HOOK_NVTX} DESCRIPTION "Support for NVTX in DR_HOOK" diff --git a/cmake/FindPAPI.cmake b/cmake/FindPAPI.cmake new file mode 100644 index 00000000..35c3e6f9 --- /dev/null +++ b/cmake/FindPAPI.cmake @@ -0,0 +1,44 @@ +# Try to find PAPI headers and libraries. +# +# Usage of this module as follows: +# +# find_package(PAPI) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# PAPI_ROOT Set this variable to the root installation of +# libpapi if the module has problems finding the +# proper installation path. +# +# Variables defined by this module: +# +# PAPI_FOUND System has PAPI libraries and headers +# PAPI_LIBRARIES The PAPI library +# PAPI_INCLUDE_DIRS The location of PAPI headers + +find_path(PAPI_ROOT + NAMES include/papi.h +) + +find_library(PAPI_LIBRARIES + # Prefer the shared library; fall back to the static archive. 
+ NAMES libpapi.so libpapi.a papi + HINTS ${PAPI_ROOT}/lib +) + +find_path(PAPI_INCLUDE_DIRS + NAMES papi.h + HINTS ${PAPI_ROOT}/include +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(PAPI DEFAULT_MSG + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) + +mark_as_advanced( + PAPI_LIBRARIES + PAPI_INCLUDE_DIRS +) diff --git a/src/fiat/CMakeLists.txt b/src/fiat/CMakeLists.txt index 99a5ce2e..08dcc332 100644 --- a/src/fiat/CMakeLists.txt +++ b/src/fiat/CMakeLists.txt @@ -83,6 +83,19 @@ if (HAVE_DR_HOOK_NVTX) endif() endif() + +if (HAVE_DR_HOOK_PAPI) + # Files from within DrHook + ecbuild_list_add_pattern( LIST fiat_papi_src GLOB *.c SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/drhook/extensions/papi) + target_sources(fiat PRIVATE ${fiat_papi_src}) + target_include_directories(fiat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/drhook/extensions/papi) + + # Files defined externally + target_link_libraries ( fiat PRIVATE ${PAPI_LIBRARIES} ) + target_include_directories ( fiat PRIVATE ${PAPI_INCLUDE_DIRS} ) + target_compile_definitions ( fiat PRIVATE DR_HOOK_HAVE_PAPI=1 ) +endif() + if( ${CMAKE_SYSTEM_NAME} MATCHES "Darwin" ) # Following should not be necessary; # Probably a bug in the M1 prerelease of gfortran 10.2.0.4 @@ -103,7 +116,7 @@ else() endif() if( HAVE_OMP ) - target_link_libraries( fiat PRIVATE OpenMP::OpenMP_Fortran ) + target_link_libraries( fiat PRIVATE OpenMP::OpenMP_Fortran ) endif() fiat_target_ignore_missing_symbols( TARGET fiat SYMBOLS diff --git a/src/fiat/drhook/drhook.c b/src/fiat/drhook/drhook.c index 8074d8d5..9743232b 100644 --- a/src/fiat/drhook/drhook.c +++ b/src/fiat/drhook/drhook.c @@ -1,7 +1,7 @@ /* * (C) Copyright 2005- ECMWF. * (C) Copyright 2013- Meteo-France. - * + * * This software is licensed under the terms of the Apache Licence Version 2.0 * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. 
* In applying this licence, ECMWF does not waive the privileges and immunities @@ -15,7 +15,7 @@ #define _GNU_SOURCE -/* +/* drhook.c Author: Sami Saarinen, ECMWF, 14..24-Nov-2003 @@ -82,9 +82,17 @@ static int backtrace(void **buffer, int size) { return 0; } #include #include #include + +// Extension headers #ifdef DR_HOOK_HAVE_NVTX #include "dr_hook_nvtx.h" #endif +#ifdef DR_HOOK_HAVE_PAPI +#include "drhook_papi.h" +#else +// This type is in the signature of remove_calltree() +#define long_long long long +#endif #include "ec_get_cycles.h" static long long int *thread_cycles = NULL; @@ -142,7 +150,7 @@ static void drhook_oml_init_lock() { } oml_init_lockid_with_name(&DRHOOK_lock, "drhook.c:DRHOOK_lock"); oml_set_debug(saved_state); -} +} #if !defined(CACHELINESIZE) #if defined(LEVEL1_DCACHE_LINESIZE) @@ -305,13 +313,14 @@ static int opt_getpag = 0; static int opt_walltime = 0; static int opt_cputime = 0; static int opt_wallprof = 0; +static int opt_papi = 0; static int opt_cpuprof = 0; static int opt_memprof = 0; static int opt_cycles = 0; static int opt_trim = 0; static int opt_calls = 0; -static int opt_self = 1; /* 0=exclude drhook altogether, - 1=include, but don't print, +static int opt_self = 1; /* 0=exclude drhook altogether, + 1=include, but don't print, 2=also print */ static int opt_propagate_signals = 1; static int opt_sizeinfo = 1; @@ -335,9 +344,9 @@ static int opt_funcenter = 0; static int opt_funcexit = 0; static int opt_timeline = 0; /* myproc or -1 [or 0 for --> timeline feature off (default)] */ -static int opt_timeline_thread = 1; /* thread-id control : +static int opt_timeline_thread = 1; /* thread-id control : <= 0 print for all threads - 1 -> #1 only [but curheap still SUM of all threads] (default), + 1 -> #1 only [but curheap still SUM of all threads] (default), n -> print for increasing number of threads separately : [1..n] */ static int opt_timeline_format = 1; /* if 1, print only {wall,hwm,rss,curheap} w/o labels "wall=" etc.; else fully 
expanded fmt */ static int opt_timeline_unitno = 6; /* Fortran unit number : default = 6 i.e. stdout */ @@ -350,7 +359,7 @@ static int opt_gencore_signal = 0; static int opt_random_memstat = 0; /* > 0 if to obtain random memory stats (maxhwm, maxstk) for tid=1. Updated when rand() % opt_random_memstat == 0 */ static double opt_trace_stack = 0; /* if > 0, a multiplier for OMP_STACKSIZE to monitor high master thread stack usage -- - -- implies opt_random_memstat = 1 (regardless of DR_HOOK_RANDOM_MEMSTAT setting) + -- implies opt_random_memstat = 1 (regardless of DR_HOOK_RANDOM_MEMSTAT setting) -- for master MPI task only (for the moment) */ static long long int drhook_oml_stacksize = 0; /* Slave stack size -- an indicative stack size even master thread should not exceed */ @@ -388,7 +397,7 @@ static drhook_timeline_t *timeline = NULL; #define SA_SIGINFO 0 #define SIG_EXTRA_ARGS /* empty */ #define SIG_PASS_EXTRA_ARGS /* empty */ -#else +#else #define SIG_EXTRA_ARGS , siginfo_t *sigcode, void *sigcontextptr #define SIG_PASS_EXTRA_ARGS , sigcode, sigcontextptr #endif @@ -470,7 +479,7 @@ typedef struct drhook_key_t { double wall_in, delta_wall_all, delta_wall_child; double cpu_in, delta_cpu_all, delta_cpu_child; long long int cycles_in, delta_cycles_all, delta_cycles_child; - char *filename; /* the filename where the 1st call (on this routine-name) + char *filename; /* the filename where the 1st call (on this routine-name) to dr_hook() occurred */ long long int sizeinfo; /* # of data elements, bytes, etc. */ long long int min_sizeinfo, max_sizeinfo; /* min & max of # of data elements, bytes, etc. 
*/ @@ -480,6 +489,13 @@ typedef struct drhook_key_t { long long int maxmem_selfdelta, maxmem_alldelta; long long int mem_maxhwm, mem_maxrss, mem_maxstk, mem_maxpagdelta; long long int paging_in; + +#if defined(DR_HOOK_HAVE_PAPI) + long_long counters_in[MAXNPAPICNTRS]; + long_long delta_counters_all[MAXNPAPICNTRS]; + long_long delta_counters_child[MAXNPAPICNTRS]; +#endif + unsigned long long int alloc_count, free_count; #if defined(DR_HOOK_HAVE_NVTX) unsigned long long int skipped_nvtx_calls; @@ -511,6 +527,10 @@ typedef struct drhook_prof_t { double pc; double total; double self; +#if defined(DR_HOOK_HAVE_PAPI) + long_long counter_tot[MAXNPAPICNTRS]; + long_long counter_self[MAXNPAPICNTRS]; +#endif unsigned long long int calls; double percall_ms_self; double percall_ms_total; @@ -606,7 +626,7 @@ static void set_ec_drhook_label(const char *hostname, long hlen) #define NSECS(x) ((int)(1000000000 * ((x) - SECS(x)))) #ifdef _DRHOOK_TIMER_T_ -static void set_killer_timer(const int *ntids, const int *target_omltid, +static void set_killer_timer(const int *ntids, const int *target_omltid, const int *target_sig, const double *start_time, const char *p, long plen) { @@ -630,13 +650,13 @@ static void set_killer_timer(const int *ntids, const int *target_omltid, sev.sigev_notify = SIGEV_SIGNAL; #endif sev.sigev_value.sival_ptr = &timerid; - + its.it_value.tv_sec = SECS(*start_time); its.it_value.tv_nsec = NSECS(*start_time); - + its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0; - + #if defined(CLOCK_BOOTTIME) timer_create(CLOCK_BOOTTIME, &sev, &timerid); #else @@ -644,7 +664,7 @@ static void set_killer_timer(const int *ntids, const int *target_omltid, #endif /* timer_create(CLOCK_REALTIME, &sev, &timerid); */ timer_settime(timerid, 0, &its, NULL); - + cas_lock(&TimedKill); { fprintf(stderr, @@ -797,14 +817,13 @@ static int set_default_handler(int sig, int unlimited_corefile, int verbose) /*--- malloc_drhook ---*/ -static void * -malloc_drhook(size_t size) +void 
*malloc_drhook(size_t size) { size_t size1 = MAX(1,size); void *p = malloc(size1); if (!p) { fprintf(stderr, - "***Error in malloc_drhook(): Unable to allocate space for %lld bytes\n", + "***Error in malloc_drhook(): Unable to allocate space for %lld bytes\n", (long long int)size1); DRHOOK_ABORT(); } @@ -1048,9 +1067,11 @@ insert_calltree(int tid, drhook_key_t *keyptr) /*--- remove_calltree ---*/ -static void -remove_calltree(int tid, drhook_key_t *keyptr, - const double *delta_wall, const double *delta_cpu, const long long int *delta_cycles) +static void +remove_calltree(int tid, drhook_key_t *keyptr, + const double *delta_wall, const double *delta_cpu, + const long long int *delta_cycles,long_long * delta_counters + ) { if (tid >= 1 && tid <= numthreads) { drhook_calltree_t *treeptr = thiscall[tid-1]; @@ -1059,6 +1080,13 @@ remove_calltree(int tid, drhook_key_t *keyptr, if (treeptr->prev) { drhook_key_t *parent_keyptr = treeptr->prev->keyptr; if (parent_keyptr) { /* extra security */ +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) + drhook_papi_add(NULL, + parent_keyptr->delta_counters_child, + delta_counters + ); +#endif if (opt_walltime) { parent_keyptr->delta_wall_child += (*delta_wall); } @@ -1133,12 +1161,12 @@ memstat(drhook_key_t *keyptr, const int *thread_id, int in_getkey) if (opt_memprof) { keyptr->mem_seenmax = getmaxcurheap_thread_(thread_id); if (in_getkey) { /* Upon enter of a Dr.Hook'ed routine */ - /* A note for "keyptr->mem_curdelta": + /* A note for "keyptr->mem_curdelta": 1) do not reset to 0 2) initially calloc'ed to 0 while initializing the keydata[] ~ alias keyptr 3) remember the previous value --> catches memory leaks, too !! 
*/ /* keyptr->mem_curdelta = 0; */ - /* Nearly the same holds for "keyptr->mem_child"; + /* Nearly the same holds for "keyptr->mem_child"; we need to capture the maximum/hwm for child */ /* keyptr->mem_child = 0; */ keyptr->paging_in = keyptr->paging; @@ -1357,7 +1385,7 @@ ignore_one_signal(int sig, int silent) { int tid = drhook_oml_get_thread_num(); char *pfx = PREFIX(tid); fprintf(stderr, - "%s %s [%s@%s:%d] DR_HOOK ignores signal#%d (%s)\n", + "%s %s [%s@%s:%d] DR_HOOK ignores signal#%d (%s)\n", pfx,TIMESTR(tid),FFL, sig,strsignal(sig)); } @@ -1369,7 +1397,7 @@ static void ignore_signals(int silent) { char *env = getenv("DR_HOOK_IGNORE_SIGNALS"); - + if (!silent && myproc == 1) { int tid = drhook_oml_get_thread_num(); char *pfx = PREFIX(tid); @@ -1452,7 +1480,7 @@ ignore_signals(int silent) static int set_unlimited_corefile(unsigned long long int *hardlimit, int enforce) { - /* + /* Make sure we *only* set soft-limit (not hard-limit) to 0 in our scripts i.e. : $ ulimit -S -c 0 but *not* @@ -1476,10 +1504,10 @@ static int set_unlimited_corefile(unsigned long long int *hardlimit, int enforce return rc; } -static void +static void signal_gencore(int sig SIG_EXTRA_ARGS) { - if (opt_gencore > 0) { + if (opt_gencore > 0) { opt_gencore = 0; /* A tiny chance for a race condition between threads */ if (sig == opt_gencore_signal && sig >= 1 && sig <= NSIG) { signal(sig, SIG_IGN); @@ -1525,7 +1553,7 @@ static char *safe_llitoa(long long int i, char b[], int blen) } -static void +static void signal_harakiri(int sig SIG_EXTRA_ARGS) { /* A signal handler that will force to exit the current thread immediately for sure */ @@ -1563,12 +1591,12 @@ signal_harakiri(int sig SIG_EXTRA_ARGS) #if 0 batch_kill_(); #endif - + raise(SIGKILL); /* Use raise, not RAISE here */ _exit(128+ABS(sig)); /* Should never reach here, bu' in case it does, then ... */ } -static void +static void signal_drhook(int sig SIG_EXTRA_ARGS) { volatile int nfirst = drhook_use_lockfile ? 
0 : 1; @@ -1584,7 +1612,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) if (sig < 1 || sig > NSIG) return; // .. since have seen this, too :-( if (been_here_already++ > 0) return; // avoid calling more than once ... since it leads more often than not into troubles - + cas_lock(&thing); unixtid = ec_gettid(); @@ -1597,7 +1625,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) sigset_t newmask, oldmask; /* A tiny chance for a race condition between threads */ - // Using compare-and-swap -stuff from the include cas.h (also in ecProf) + // Using compare-and-swap -stuff from the include cas.h (also in ecProf) /* Signal catching */ { @@ -1606,24 +1634,24 @@ signal_drhook(int sig SIG_EXTRA_ARGS) } if (ec_drhook && tid >= 1 && tid <= numthreads) ec_drhook[tid-1].nsigs = nsigs; /* Store for possible signal_harakiri() */ - - /*------------------------------------------------------------ + + /*------------------------------------------------------------ Strategy: - drhook intercepts most interrupts. - - 1st interupt will + - 1st interupt will - call alarm(10) to try to make sure 2nd interrupt received - try to call tracebacks and exit (which includes atexits) - - 2nd (and subsequent) interupts will - - spin for 20 sec (to give 1st interrupt time to complete tracebacks) + - 2nd (and subsequent) interupts will + - spin for 20 sec (to give 1st interrupt time to complete tracebacks) - and then call _exit (bypassing atexit) ------------------------------------------------------------*/ - + /* if (sig != SIGTERM) signal(SIGTERM, SIG_DFL); */ /* Let the default SIGTERM to occur */ - + // max_threads = drhook_oml_get_max_threads(); if (nsigs == 1) { /*---- First call to signal handler: call alarm(drhook_harakiri_timeout), tracebacks, exit ------*/ - + if (!nfirst) { // Correct coding : one and only one task obtains exclusive creation mask -- others fire blanks! 
int fd = open(drhook_lockfile,O_CREAT|O_WRONLY|O_TRUNC|O_EXCL,S_IRUSR|S_IWUSR); @@ -1756,7 +1784,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) fprintf(stderr, "%s %s [%s@%s:%d] Signal#%d was caused by %s [memaddr=%p] [excepts=0x%x [%d]] : %p at %s(%s), nsigs = %d\n", pfx,TIMESTR(tid),FFL, - sig, s, + sig, s, addr, excepts, excepts, bt, @@ -1768,7 +1796,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) fprintf(stderr, "%s %s [%s@%s:%d] Signal#%d was caused by %s [memaddr=%p] : %p at %s(%s), nsigs = %d\n", pfx,TIMESTR(tid),FFL, - sig, s, + sig, s, addr, bt, dlinfo.dli_fname ? dlinfo.dli_fname : "", @@ -1818,17 +1846,17 @@ signal_drhook(int sig SIG_EXTRA_ARGS) } /* All below this point should be nsigs == 1 i.e. the first threat arriving signal_drhook() */ - + /* sigfillset(&newmask); -- dead code since sigprocmask() was not called */ /* sigemptyset(&newmask); sigaddset(&newmask, sig); */ - + /* Start critical region (we don't want any signals to interfere while doing this) */ /* sigprocmask(SIG_BLOCK, &newmask, &oldmask); */ - - if (nsigs == 1 && nfirst) { + + if (nsigs == 1 && nfirst) { /* Print Dr.Hook traceback */ const int ftnunitno = 0; /* stderr */ const int print_option = 2; /* calling tree */ @@ -1866,37 +1894,37 @@ signal_drhook(int sig SIG_EXTRA_ARGS) /* To make it less likely that another thread generates a signal while we are doing a traceback lets wait a while (seems to fix problems of the traceback terminating abnormally. Probably a better way of doing this involving holding - off signals but sigprocmask is not safe in multithreaded code - P Towers Dec 10 2012 + off signals but sigprocmask is not safe in multithreaded code - P Towers Dec 10 2012 This was originally an issue with the Intel compiler but may be of benefit for other - compilers. Cannot see it doing harm - P Towers Aug 29 2013 */ + compilers. Cannot see it doing harm - P Towers Aug 29 2013 */ // spin(MIN(5,tid)); // obsolete: only one thread (and task) ever gets here ! 
if (sig != SIGABRT && sig != SIGTERM) { #if (defined(LINUX) || defined(__APPLE__)) LinuxTraceBack(pfx,TIMESTR(tid),NULL); #endif - + #ifdef __INTEL_COMPILER intel_trbk_(); /* from ../utilities/gentrbk.F90 */ -#endif +#endif } - - fprintf(stderr, - "%s %s [%s@%s:%d] DrHook backtrace done for signal#%d, nsigs = %d\n", + + fprintf(stderr, + "%s %s [%s@%s:%d] DrHook backtrace done for signal#%d, nsigs = %d\n", pfx,TIMESTR(tid),FFL,sig,nsigs); } - + /* sigprocmask(SIG_SETMASK, &oldmask, 0); */ /* End critical region : the original signal state restored */ - + { int restored = 0, tdiff; time_t t1, t2; drhook_sigfunc_t u; u.func3args = signal_drhook; if (opt_propagate_signals && - sl->old.sa_handler != SIG_DFL && - sl->old.sa_handler != SIG_IGN && + sl->old.sa_handler != SIG_DFL && + sl->old.sa_handler != SIG_IGN && sl->old.sa_handler != u.func1args) { u.func1args = sl->old.sa_handler; @@ -1928,7 +1956,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) set_default_handler(SIGSEGV,1,1); restored = 1; break; - default: + default: break; } } @@ -1936,7 +1964,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) fprintf(stderr, "%s %s [%s@%s:%d] Calling previous signal handler at %p for signal#%d, nsigs = %d\n", pfx,TIMESTR(tid),FFL, - (void*) u.func1args,sig,nsigs); + (void*) u.func1args,sig,nsigs); time(&t1); u.func3args(sig SIG_PASS_EXTRA_ARGS); /* This could now be the ATP */ @@ -1947,7 +1975,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) "%s %s [%s@%s:%d] Returned from previous signal handler" " (at %p, signal#%d, time taken = %ds), nsigs = %d\n", pfx,TIMESTR(tid),FFL, - (void*) u.func1args,sig,tdiff,nsigs); + (void*) u.func1args,sig,tdiff,nsigs); if (atp_enabled && restored && atp_max_cores > 0) { /* Assuming it was indeed ATP, then lets spin a bit to allow other cores be dumped */ @@ -1986,7 +2014,7 @@ signal_drhook(int sig SIG_EXTRA_ARGS) } } } - + { int errcode = 128 + ABS(sig); /* Make sure that the process/thread really exits now -- immediately !! 
*/ @@ -2001,19 +2029,19 @@ signal_drhook(int sig SIG_EXTRA_ARGS) /*--- signal_drhook_init ---*/ -static void +static void signal_drhook_init(int enforce) { char *env = getenv("DR_HOOK_SILENT"); int silent = env ? atoi(env) : 0; int j; - dr_hook_procinfo_(&myproc, &nproc); - if (myproc < 1) myproc = 1; /* Just to enable output as if myproc was == 1 */ - /* Signals may not yet been set, since MPI not initialized - Only enforce-parameter can enforce to set these => no output on myproc=1 */ - if (!enforce && (myproc < 1 || nproc < 0)) return; + int mpi_init; + c_dr_hook_procinfo(&myproc, &nproc, &mpi_init); + /* Signals may not yet have been set, since MPI is not initialized + Enforce parameter for setting signals regardless of MPI state */ + if (!enforce && !mpi_init) return; if (signals_set) return; /* Extra safety */ - /* To present sumpini.F90 (f.ex.) initializing DrHook-signals in case of + /* To present sumpini.F90 (f.ex.) initializing DrHook-signals in case of DR_HOOK was turned off (=0), then set also export DR_HOOK_INIT_SIGNALS=0 */ env = getenv("DR_HOOK_INIT_SIGNALS"); if (env && *env == '0') { @@ -2160,6 +2188,21 @@ get_memmon_out(int me) return s; } +/*--- get_csv_out ---*/ + +static char * +get_csv_out(int me) +{ + char *s = NULL; + char *p = get_mon_out(me); + if (p) { + s = malloc_drhook((strlen(p) + 5) * sizeof(*s)); + sprintf(s,"%s.csv",p); + } + if (!s) s = strdup_drhook("drhook.prof.0.csv"); + return s; +} + /*--- random_memstat ---*/ static void @@ -2171,7 +2214,7 @@ random_memstat(int tid, int enforce) long long int maxhwm = getmaxhwm_(); long long int maxstk = getmaxstk_(); if (drhook_stacksize_threshold > 0 && maxstk > drhook_stacksize_threshold) { - /* Abort hopefully with traceback */ + /* Abort hopefully with traceback */ char *pfx = PREFIX(tid); long long int vmpeak = getvmpeak_() / (long long int) 1048576; long long int threshold = drhook_stacksize_threshold / (long long int) 1048576; @@ -2221,7 +2264,9 @@ process_options() if(fp) 
fprintf(fp,"[EC_DRHOOK:hostname:myproc:omltid:pid:unixtid] [YYYYMMDD:HHMMSS:walltime] [function@file:lineno] -- Max OpenMP threads = %d\n",drhook_oml_get_max_threads()); OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_SILENT=%d\n",pfx,TIMESTR(tid),FFL,opt_silent); - OPTPRINT(fp,"%s %s [%s@%s:%d] fp = %p\n",pfx,TIMESTR(tid),FFL,(void*)fp); + // Compiler gets concerned that we may be reading and writing to fp otherwise... + void *definitely_not_fp = (void*)fp; + OPTPRINT(fp,"%s %s [%s@%s:%d] fp = %p\n",pfx,TIMESTR(tid),FFL,definitely_not_fp); env = getenv("ATP_ENABLED"); atp_enabled = env ? atoi(env) : 0; @@ -2250,7 +2295,7 @@ process_options() unsigned long long int hardlimit = 0; int rc = set_unlimited_corefile(&hardlimit,1); if (rc == 0) { - OPTPRINT(fp,"%s %s [%s@%s:%d] Hardlimit for core file is now %llu (0x%llx)\n", + OPTPRINT(fp,"%s %s [%s@%s:%d] Hardlimit for core file is now %llu (0x%llx)\n", pfx,TIMESTR(tid),FFL,hardlimit,hardlimit); } } @@ -2298,7 +2343,7 @@ process_options() if (env) { opt_timeline = atoi(env); } - + if (opt_timeline) { OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_TIMELINE=%d\n",pfx,TIMESTR(tid),FFL,opt_timeline); @@ -2307,13 +2352,13 @@ process_options() opt_timeline_thread = atoi(env); } OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_TIMELINE_THREAD=%d\n",pfx,TIMESTR(tid),FFL,opt_timeline_thread); - + env = getenv("DR_HOOK_TIMELINE_FORMAT"); if (env) { opt_timeline_format = atoi(env); } OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_TIMELINE_FORMAT=%d\n",pfx,TIMESTR(tid),FFL,opt_timeline_format); - + env = getenv("DR_HOOK_TIMELINE_UNITNO"); if (env) { opt_timeline_unitno = atoi(env); @@ -2338,7 +2383,7 @@ process_options() env = getenv("DR_HOOK_TRACE_STACK"); if (env) { opt_trace_stack = atof(env); - if (opt_trace_stack < 0) + if (opt_trace_stack < 0) opt_trace_stack = 0; else { drhook_oml_stacksize = slave_stacksize(); @@ -2365,7 +2410,7 @@ process_options() } OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_RANDOM_MEMSTAT=%d 
(RAND_MAX=%d)\n",pfx,TIMESTR(tid),FFL,opt_random_memstat,RAND_MAX); - + env = getenv("DR_HOOK_HASHBITS"); if (env) { int value = atoi(env); @@ -2482,7 +2527,7 @@ process_options() if (opt_gencore) { OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_GENCORE=%d\n",pfx,TIMESTR(tid),FFL,opt_gencore); - + env = getenv("DR_HOOK_GENCORE_SIGNAL"); if (env) { int itmp = atoi(env); @@ -2531,7 +2576,7 @@ process_options() if (opt_nvtx_SWT < 0) opt_nvtx_SWT = nvtx_SWT_default; - OPTPRINT(fp, "%s %s [%s@%s:%g] DR_HOOK_NVTX_SPAM_WT=%g\n", pfx, TIMESTR(tid), FFL, nvtx_SWT_default); + OPTPRINT(fp, "%s %s [%s@%s:%d] DR_HOOK_NVTX_SPAM_WT=%g\n", pfx, TIMESTR(tid), FFL, nvtx_SWT_default); } } @@ -2545,7 +2590,7 @@ process_options() while (*p) { if (islower(*p)) *p = toupper(*p); p++; - } + } p = strtok(s,delim); /* if (p) OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_OPT=\"",pfx,TIMESTR(tid)); */ if (p && fp) { @@ -2555,7 +2600,7 @@ process_options() while (p) { /* Assume that everything is OFF by default */ if (strequ(p,"ALL")) { /* all except profiler data */ - opt_gethwm = opt_getstk = opt_getrss = opt_getpag = opt_walltime = opt_cputime = opt_cycles = 1; + opt_papi = opt_gethwm = opt_getstk = opt_getrss = opt_getpag = opt_walltime = opt_cputime = opt_cycles = 1; opt_calls = 1; any_memstat++; OPTPRINT(fp,"%s%s",comma,"ALL"); comma = ","; @@ -2625,6 +2670,15 @@ process_options() opt_cycles = 1; OPTPRINT(fp,"%s%s",comma,"WALLPROF"); comma = ","; } + else if (strequ(p,"COUNTERS") ) { + opt_wallprof = 1; + opt_walltime = 1; + opt_cpuprof = 0; /* Note: Switches cpuprof OFF */ + opt_calls = 1; + opt_cycles = 1; + opt_papi = 1; + OPTPRINT(fp,"%s%s",comma,"COUNTERS"); comma = ","; + } else if (strequ(p,"CPUPROF")) { opt_cpuprof = 1; opt_cputime = 1; @@ -2660,6 +2714,8 @@ process_options() else if (strequ(p,"CALLPATH")) { opt_callpath = 1; OPTPRINT(fp,"%s%s",comma,"CALLPATH"); comma = ","; + } else { + printf("DrHook: Warning - no match for HOOK_OPT : %s\n",p); } p = strtok(NULL,delim); } @@ -2677,20 +2733,20 
@@ process_options() if (callpath_indent < 1 || callpath_indent > 8) callpath_indent = callpath_indent_default; } OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_CALLPATH_INDENT=%d\n",pfx,TIMESTR(tid),FFL,callpath_indent); - + env = getenv("DR_HOOK_CALLPATH_DEPTH"); if (env) { callpath_depth = atoi(env); if (callpath_depth < 0) callpath_depth = callpath_depth_default; } OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_CALLPATH_DEPTH=%d\n",pfx,TIMESTR(tid),FFL,callpath_depth); - + env = getenv("DR_HOOK_CALLPATH_PACKED"); if (env) { callpath_packed = atoi(env); } OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_CALLPATH_PACKED=%d\n",pfx,TIMESTR(tid),FFL,callpath_packed); - + env = getenv("DR_HOOK_CALLTRACE"); if (env) { opt_calltrace = atoi(env); @@ -2698,6 +2754,49 @@ process_options() OPTPRINT(fp,"%s %s [%s@%s:%d] DR_HOOK_CALLTRACE=%d\n",pfx,TIMESTR(tid),FFL,opt_calltrace); } +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) { + newline = 0; + env = getenv("DR_HOOK_PAPI_COUNTERS"); + if (env) { + const char delim[] = ", \t/"; + char *comma = " DR_HOOK_PAPI_COUNTERS=\""; + char *s = strdup_drhook(env); + char *p = s; + while (*p) { + if (islower(*p)) *p = toupper(*p); + p++; + } + p = strtok(s,delim); + if (p && fp) { + fprintf(fp,"%s %s [%s@%s:%d]",pfx,TIMESTR(tid),FFL); + newline = 1; + } + for (int i = 0; p && i < drhook_papi_max_num_counters(); p = strtok(NULL,delim), i++) { + drhook_papi_add_counter_name(strdup_drhook(p)); + OPTPRINT(fp,"%s%s",comma,p); comma = ","; + } + + free_drhook(s); + if (*comma == ',') { + OPTPRINT(fp,"\"\n"); + newline = 0; + } + if (newline) OPTPRINT(fp,"\n"); + } else { + const char* default_events[4] = { + "PAPI_TOT_CYC", + "PAPI_FP_OPS", + "PAPI_L1_DCA", + "PAPI_L2_DCM" + }; + for (int i = 0; i < 4; i++) { + drhook_papi_add_counter_name(strdup_drhook(default_events[i])); + } + } + } +#endif + if (opt_wallprof || opt_cpuprof || opt_memprof || opt_timeline) { atexit(do_prof); } @@ -2807,7 +2906,7 @@ getkey(int tid, const char *name, int name_len, 
keyptr->name[name_len] = 0; } if (filename_len > 0 && - filename && + filename && *filename) { char *psave = NULL; char *p = psave = malloc_drhook((filename_len+1)*sizeof(*filename)); @@ -2828,9 +2927,9 @@ getkey(int tid, const char *name, int name_len, } found = 1; } - if (found || + if (found || (keyptr->name_len == name_len && - (!callpath || (callpath && keyptr->callpath && + (!callpath || (callpath && keyptr->callpath && keyptr->callpath_len == callpath_len && keyptr->callpath_fullhash == fullhash)) && ((!opt_trim && *keyptr->name == *name && strnequ(keyptr->name, name, name_len)) || @@ -2838,6 +2937,9 @@ getkey(int tid, const char *name, int name_len, if (opt_walltime) keyptr->wall_in = walltime ? *walltime : WALLTIME(); if (opt_cputime) keyptr->cpu_in = cputime ? *cputime : CPUTIME(); if (opt_cycles) keyptr->cycles_in = cycles ? *cycles : ec_get_cycles(); +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) drhook_papi_readAll(keyptr->counters_in); +#endif if (any_memstat) memstat(keyptr,&tid,1); if (opt_calls) { keyptr->calls++; @@ -2845,7 +2947,7 @@ getkey(int tid, const char *name, int name_len, } #if defined(DR_HOOK_HAVE_NVTX) // Helps filter out wrapper calls that may be noise - if (opt_nvtx && drhook_oml_get_thread_num() == 1){ + if (opt_nvtx && tid == 1){ if (keyptr->calls > opt_nvtx_SCC && keyptr->delta_wall_all < opt_nvtx_SWT) { if (!opt_silent) fprintf(stderr,"DRHOOK:NVTX: Skipping opening of region %s\n", keyptr->name); @@ -2996,16 +3098,25 @@ putkey(int tid, drhook_key_t *keyptr, const char *name, int name_len, } } #endif - remove_calltree(tid, keyptr, &delta_wall, &delta_cpu, &delta_cycles); + long_long * delta_counters = NULL; +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) { + delta_counters = alloca(drhook_papi_num_counters() * sizeof(long_long)); + drhook_papi_bzero(delta_counters); + drhook_papi_subtract(delta_counters, NULL, keyptr->counters_in); + drhook_papi_add(NULL, keyptr->delta_counters_all, delta_counters); + } +#endif + 
remove_calltree(tid, keyptr, &delta_wall, &delta_cpu, &delta_cycles, delta_counters); } } - + /*--- init_drhook ---*/ static void init_drhook(int ntids) { - if (numthreads == 0 || !keydata || !calltree || !keyself || !overhead || !curkeyptr || !cstk) { + if (numthreads == 0 || !keydata || !calltree || !keyself || !overhead || !curkeyptr || !cstk) { int j; if (pid == -1) { /* Ensure that called just once */ { @@ -3112,9 +3223,9 @@ if (overhead && tid >= 1 && tid <= numthreads) { \ } static drhook_key_t * -itself(drhook_key_t *keyptr_self, - int tid, int opt, double *delta_time, - const double *walltime, const double *cputime) +itself(drhook_key_t *keyptr_self, + int tid, int opt, double *delta_time, + const double *walltime, const double *cputime) { drhook_key_t *keyptr = NULL; if (keyself) { @@ -3122,6 +3233,10 @@ itself(drhook_key_t *keyptr_self, if (opt == 0) { if (opt_wallprof) keyptr->wall_in = walltime ? *walltime : WALLTIME(); else keyptr->cpu_in = cputime ? *cputime : CPUTIME(); +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) + drhook_papi_readAll(keyptr->counters_in); +#endif keyptr->calls++; } else if (opt == 1) { @@ -3135,6 +3250,19 @@ itself(drhook_key_t *keyptr_self, keyptr->delta_cpu_all += delta; } if (delta_time) *delta_time = delta; + +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) { + long_long cntrs_delta[MAXNPAPICNTRS]; + + /* cntrs_delta = current - counters_in */ + drhook_papi_subtract(cntrs_delta, NULL, keyptr->counters_in); + + /* keyptr->delta_counters_all += cntrs_delta */ + drhook_papi_add(NULL, keyptr->delta_counters_all,cntrs_delta); + } +#endif + } } return keyptr; @@ -3143,7 +3271,7 @@ itself(drhook_key_t *keyptr_self, /*--- commie -routines : adds "," i.e. 
comma after each 3 digit, e.g.: 1234567890 becomes more readable 1,234,567,890 */ -static void +static void lld_commie(long long int n, char sd[]) { const char comma = ','; @@ -3169,7 +3297,7 @@ lld_commie(long long int n, char sd[]) } } -static void +static void dbl_commie(double n, char sd[]) { const char comma = ','; @@ -3198,7 +3326,7 @@ dbl_commie(double n, char sd[]) /*--- callpath as a "pathname" ---*/ static void -unroll_callpath(FILE *fp, int len, +unroll_callpath(FILE *fp, int len, const equivalence_t *callpath, int callpath_len) { if (fp && callpath && callpath_len > 0) { @@ -3259,7 +3387,7 @@ static void do_prof() { /* to avoid recursive signals while atexit() (e.g. SIGXCPU) */ - if (signal_handler_ignore_atexit) return; + if (signal_handler_ignore_atexit) return; if (!do_prof_off && (opt_wallprof || opt_cpuprof)) { /* CPU or wall-clock profiling */ @@ -3270,6 +3398,15 @@ do_prof() c_drhook_print_(&ftnunitno, &master, &print_option, &initlev); } + if (!do_prof_off && (opt_papi)) { + /* PAPI hardware-counter profiling */ + const int ftnunitno = 0; + const int master = 1; + const int print_option = 3; + int initlev = 0; + c_drhook_print_(&ftnunitno, &master, &print_option, &initlev); + } + if (!do_prof_off && opt_memprof) { /* Memory profiling */ const int ftnunitno = 0; @@ -3315,7 +3452,7 @@ typedef enum { /* See dr_hook_watch_mod.F90 */ KEY_I4 = 4, KEY_I8 = 8, KEY_R4 = 16, - KEY_R8 = 32 + KEY_R8 = 32 } PrintWatchKeys_t; static void print_watch(int ftnunitno, int key, const void *ptr, int n) @@ -3350,7 +3487,7 @@ static void print_watch(int ftnunitno, int key, const void *ptr, int n) } } -static void +static void check_watch(const char *label, const char *name, int name_len, @@ -3416,14 +3553,23 @@ c_drhook_check_watch_(const char *where, } /*** PUBLIC ***/ +#if defined(DR_HOOK_HAVE_PAPI) +#define PAPIREAD \ + if (opt_papi) { \ + long_long cntrs[MAXNPAPICNTRS]; \ + drhook_papi_readAll(cntrs); \ + } +#else +#define PAPIREAD /*NOOP*/ +#endif #define TIMERS \ 
double walltime = opt_walltime ? WALLTIME() : 0; \ double cputime = opt_cputime ? CPUTIME() : 0; \ long long int cycles = opt_cycles ? ec_get_cycles() : 0; \ long long int hwm = opt_gethwm ? gethwm_() : 0; \ - long long int stk = opt_getstk ? getstk_() : 0 - + long long int stk = opt_getstk ? getstk_() : 0; \ + PAPIREAD /*=== c_drhook_set_lhook_ ===*/ @@ -3435,12 +3581,12 @@ c_drhook_set_lhook_(const int *lhook) /*=== c_drhook_getenv_ ===*/ -void -c_drhook_getenv_(const char *s, +void +c_drhook_getenv_(const char *s, char *value, /* Hidden arguments */ int slen, - const int valuelen) + const int valuelen) { char *env = NULL; char *p = malloc_drhook(slen+1); @@ -3448,14 +3594,14 @@ c_drhook_getenv_(const char *s, fprintf(stderr,"c_drhook_getenv_(): Unable to allocate %d bytes of memory\n", slen+1); DRHOOK_ABORT(); } - memcpy(p,s,slen); + memcpy(p,s,slen); p[slen]='\0'; memset(value, ' ', valuelen); env = getenv(p); if (env) { int len = strlen(env); if (valuelen < len) len = valuelen; - memcpy(value,env,len); + memcpy(value,env,len); } free_drhook(p); } @@ -3471,7 +3617,7 @@ static void drhook_delete_lockfile() { } } -void +void c_drhook_init_(const char *progname, const int *num_threads /* Hidden length */ @@ -3480,7 +3626,7 @@ c_drhook_init_(const char *progname, init_drhook(*num_threads); //max_threads = MAX(1,*num_threads); if (a_out) free_drhook(a_out); - progname = trim(progname, &progname_len); + progname = trim(progname, &progname_len); if (progname_len > 0) { a_out = calloc_drhook(progname_len+1,sizeof(*progname)); memcpy(a_out, progname, progname_len); @@ -3506,6 +3652,10 @@ c_drhook_init_(const char *progname, tabort_delete_lockfile(); drhook_delete_lockfile(); } +#if defined(DR_HOOK_HAVE_PAPI) + if (opt_papi) drhook_papi_init(myproc -1); +#endif + } @@ -3525,7 +3675,7 @@ c_drhook_watch_(const int *onoff, { int tid = drhook_oml_get_thread_num(); drhook_watch_t *p = NULL; - if (!drhook_lhook) return; + if (!drhook_lhook) return; drhook_oml_set_lock(); @@ 
-3584,9 +3734,9 @@ c_drhook_watch_(const int *onoff, /*=== c_drhook_start_ ===*/ -void -c_drhook_start_(const char *name, - const int *thread_id, +void +c_drhook_start_(const char *name, + const int *thread_id, double *key, const char *filename, const int *sizeinfo @@ -3608,7 +3758,7 @@ c_drhook_start_(const char *name, dump_hugepages(0,pfx,tid,0,-1); } if (!opt_callpath) { - u.keyptr = getkey(*thread_id, name, name_len, + u.keyptr = getkey(*thread_id, name, name_len, filename, filename_len, &walltime, &cputime, &cycles, NULL, 0, NULL); @@ -3617,7 +3767,7 @@ c_drhook_start_(const char *name, int free_callpath = 1; int callpath_len = 0; equivalence_t *callpath = get_callpath(*thread_id, &callpath_len); - u.keyptr = getkey(*thread_id, name, name_len, + u.keyptr = getkey(*thread_id, name, name_len, filename, filename_len, &walltime, &cputime, &cycles, callpath, callpath_len, &free_callpath); @@ -3632,7 +3782,7 @@ c_drhook_start_(const char *name, (void) callstack(*thread_id, key, u.keyptr); } ITSELF_1; - if (opt_calltrace) { + if (opt_calltrace) { drhook_oml_set_lock(); { const int ftnunitno = 0; /* stderr */ @@ -3683,7 +3833,7 @@ c_drhook_start_(const char *name, /*=== c_drhook_end_ ===*/ -void +void c_drhook_end_(const char *name, const int *thread_id, const double *key, @@ -3750,7 +3900,7 @@ c_drhook_end_(const char *name, } /* if (opt_timeline_thread <= 0 || tid <= opt_timeline_thread) */ } if (watch && watch_count > 0) check_watch("when leaving routine", name, name_len, 1); - putkey(*thread_id, u.keyptr, name, name_len, + putkey(*thread_id, u.keyptr, name, name_len, *sizeinfo, &walltime, &cputime, &cycles); ITSELF_1; @@ -3788,7 +3938,7 @@ c_drhook_memcounter_(const int *thread_id, keyptr->mem_curdelta += *size; alldelta = keyptr->mem_curdelta + keyptr->mem_child; if (alldelta > keyptr->maxmem_alldelta) keyptr->maxmem_alldelta = alldelta; - if (keyptr->mem_curdelta > keyptr->maxmem_selfdelta) + if (keyptr->mem_curdelta > keyptr->maxmem_selfdelta) 
keyptr->maxmem_selfdelta = keyptr->mem_curdelta; if (keyptr_addr) { u.keyptr = keyptr; @@ -3811,7 +3961,7 @@ c_drhook_memcounter_(const int *thread_id, u.keyptr_addr = *keyptr_addr; keyptr = u.keyptr; } - else + else keyptr = curkeyptr[tid-1]; /* fprintf(stderr, @@ -3947,40 +4097,40 @@ trim_and_adjust_left(const char *p, int *name_len) return p; } -static void print_routine_name0(FILE * fp, const char * p_name, int p_tid, const char * p_filename, int p_cluster, - const equivalence_t * p_callpath, int p_callpath_len, int len, int cluster_size) +static void print_routine_name0(FILE * fp, const char * p_name, int p_tid, const char * p_filename, int p_cluster, + const equivalence_t * p_callpath, int p_callpath_len, int len, int cluster_size) { - int name_len = 0; - const char *name = trim_and_adjust_left(p_name,&name_len); + int name_len = 0; + const char *name = trim_and_adjust_left(p_name,&name_len); if (callpath_packed) { if (p_callpath && p_callpath_len > 0) { const equivalence_t * callpath = &p_callpath[p_callpath_len-1]; int j; - for (j=0; jkeyptr && callpath->keyptr->name) { const char *name = callpath->keyptr->name; int name_len = callpath->keyptr->name_len; fprintf(fp,"%.*s/",name_len,name); } - } - } - - fprintf(fp,"%.*s@%d%s%s", - name_len, name, - p_tid, - p_filename ? ":" : "", - p_filename ? p_filename : ""); - - if (opt_clusterinfo) { - fprintf(fp," [%d,%d]", - p_cluster, ABS(cluster_size)); - } - - if (!callpath_packed) - unroll_callpath(fp, len, p_callpath, p_callpath_len); - + } + } + + fprintf(fp,"%.*s@%d%s%s", + name_len, name, + p_tid, + p_filename ? ":" : "", + p_filename ? 
p_filename : ""); + + if (opt_clusterinfo) { + fprintf(fp," [%d,%d]", + p_cluster, ABS(cluster_size)); + } + + if (!callpath_packed) + unroll_callpath(fp, len, p_callpath, p_callpath_len); + } @@ -3996,9 +4146,9 @@ DrHookPrint(int ftnunitno, const char *line) { if (line) { FILE *fp = NULL; - if (ftnunitno <= 0) + if (ftnunitno <= 0) fp = stderr; - else if (ftnunitno == 6) + else if (ftnunitno == 6) fp = stdout; else dr_hook_prt_(&ftnunitno, line, strlen(line)); @@ -4006,11 +4156,11 @@ DrHookPrint(int ftnunitno, const char *line) } } -void +void c_drhook_print_(const int *ftnunitno, const int *thread_id, - const int *print_option, /* - 1=raw call counts + const int *print_option, /* + 1=raw call counts 2=calling tree 3=profiling info 4=memory profiling @@ -4074,14 +4224,14 @@ c_drhook_print_(const int *ftnunitno, } /* for (j=0; jactive)) { - int do_print = (*print_option == 2 || + int do_print = (*print_option == 2 || abs_print_option == 7 || abs_print_option == 5 || abs_print_option == 6); if (do_print) { @@ -4135,7 +4285,7 @@ c_drhook_print_(const int *ftnunitno, default: case 2: kind = ':'; is_timeline = 0; break; } - if (*print_option == 2 || + if (*print_option == 2 || (is_timeline && tid > 1 && tid <= opt_timeline_thread)) { sprintf(s,"%s %s [DrHookCallTree] %s%c ", pfx,TIMESTR(tid), @@ -4301,6 +4451,16 @@ c_drhook_print_(const int *ftnunitno, drhook_key_t *keyptr = &keydata[t][j]; while (keyptr) { if (keyptr->name && (keyptr->status == 0 || signal_handler_called)) { +#if defined(DR_HOOK_HAVE_PAPI) + /* No point slowing down this code with an if (opt_papi) + * as it can be called by signal_drhook(). This would just be + * processing zeros anyway as we only use calloc() for keys */ + drhook_papi_subtract(p->counter_self, + keyptr->delta_counters_all, + keyptr->delta_counters_child + ); + drhook_papi_cpy(p->counter_tot, keyptr->delta_counters_all); +#endif p->self = opt_wallprof ? 
keyptr->delta_wall_all - keyptr->delta_wall_child : keyptr->delta_cpu_all - keyptr->delta_cpu_child; @@ -4338,8 +4498,11 @@ c_drhook_print_(const int *ftnunitno, int *clusize = calloc_drhook(nprof+1, sizeof(*clusize)); /* make sure at least 1 element */ char *prevname = NULL; const char *fmt = "%5d %8.2f %12.3f %12.3f %12.3f %14llu %11.2f %11.2f %s"; + const char *csvfmt = "%s,%d,%d,%d,%.4f,%.6f,%.6f,%.6f,%llu"; char *filename = get_mon_out(myproc); + char *csvfilename = get_csv_out(myproc); FILE *fp = NULL; + FILE *fpcsv = NULL; if (!filename) break; @@ -4349,14 +4512,25 @@ c_drhook_print_(const int *ftnunitno, pfx,TIMESTR(tid),FFL, myproc,filename); } - fp = fopen(filename,"w"); if (!fp) goto finish_3; - + + if (opt_papi==1){ + if ((myproc == 1 && mon_out_procs == -1) || mon_out_procs == myproc) { + fprintf(stderr, + "%s %s [%s@%s:%d] Writing counter information of proc#%d into file '%s'\n", + pfx,TIMESTR(tid),FFL, + myproc,csvfilename + ); + } + fpcsv = fopen(csvfilename,"w"); + if (!fpcsv) goto finish_3; + } + /* alphanumerical sorting to find out clusters of the same routine but on different threads */ /* also find out total wall clock time */ /* calculate percentage values */ - + p = prof; qsort(p, nprof, sizeof(*p), prof_name_comp); @@ -4509,35 +4683,38 @@ c_drhook_print_(const int *ftnunitno, fprintf(fp,"\n"); { - len = + len = fprintf(fp," # %% Time Cumul Self Total # of calls Self Total "); } - fprintf(fp,"Routine@"); - if (opt_clusterinfo) fprintf(fp," [Cluster:(id,size)]"); - fprintf(fp,"\n"); - if (opt_sizeinfo) fprintf(fp,"%*s %s\n",len-20," ","(Size; Size/sec; Size/call; MinSize; MaxSize)"); - fprintf(fp, " (self) (sec) (sec) (sec) ms/call ms/call\n"); - fprintf(fp,"\n"); + fprintf(fp, "Routine@"); + if (opt_clusterinfo) + fprintf(fp, " [Cluster:(id,size)]"); + fprintf(fp, "\n"); + if (opt_sizeinfo) + fprintf(fp, "%*s %s\n",len-20," ","(Size; Size/sec; Size/call; MinSize; MaxSize)"); + fprintf(fp, " (self) (sec) (sec) (sec) ms/call ms/call\n"); + 
fprintf(fp, "\n"); cumul = 0; for (j=0; jcluster]; - if (p->pc < percent_limit) break; + if (p->pc < percent_limit) + break; if (opt_cputime) { cumul += p->self; + } else { + if (p->is_max || cluster_size == 1) + cumul += p->self; } - else { - if (p->is_max || cluster_size == 1) cumul += p->self; - } - { + + { fprintf(fp, fmt, - ++j, p->pc, cumul, p->self, p->total, p->calls, - p->percall_ms_self, p->percall_ms_total, - p->is_max ? "*" : " "); + ++j, p->pc, cumul, p->self, p->total, p->calls, + p->percall_ms_self, p->percall_ms_total, + p->is_max ? "*" : " "); } - print_routine_name(fp, p, len, cluster_size); - + if (opt_sizeinfo && p->sizeinfo > 0) { char s1[DRHOOK_STRBUF], s2[DRHOOK_STRBUF], s3[DRHOOK_STRBUF]; char s4[DRHOOK_STRBUF], s5[DRHOOK_STRBUF]; @@ -4551,8 +4728,63 @@ c_drhook_print_(const int *ftnunitno, fprintf(fp,"\n"); p++; } /* for (j=0; jcluster]; + if (opt_cputime) + cumul += p->self; + else + if (p->is_max || cluster_size == 1) cumul += p->self; + + { + fprintf(fpcsv, csvfmt, + p->name, + myproc-1, + p->tid-1, + ++j, p->pc, cumul, p->self, p->total, p->calls, + p->is_max ? "*" : " " + ); + for (int c=0;ccounter_self[c]); + for (int c=0;ccounter_tot[c]); + if (first_counter_is_cyc==1) + fprintf(fpcsv,",%.3f,%.3f", + p->counter_self[0]/p->self/1000000.0, + p->counter_tot[0]/p->total/1000000.0 + ); + } + fprintf(fpcsv, "\n"); + p++; + } /* for (j=0; j"); @@ -4778,10 +5010,10 @@ c_drhook_print_(const int *ftnunitno, t = p->tid - 1; if (p->children > maxseen_tot[t]) p->children = maxseen_tot[t]; /* adjust */ fprintf(fp, fmt, - ++j, p->pc, + ++j, p->pc, p->self, p->children, p->leaked, p->hwm, p->stk, p->pag, - p->calls, p->alloc_count, + p->calls, p->alloc_count, (p->alloc_count - p->free_count != 0) ? "*" : " ", p->free_count, p->is_max ? "*" : " "); @@ -4790,7 +5022,7 @@ c_drhook_print_(const int *ftnunitno, fprintf(fp,"\n"); p++; } /* for (j=0; j 0 ? name_len : (int)strlen(name), filename_len > 0 ? 
filename_len : (int)strlen(filename)); } else if (option == 1) { - c_drhook_end_(name, &tid, handle, + c_drhook_end_(name, &tid, handle, filename, &sizeinfo, name_len > 0 ? name_len : (int)strlen(name), filename_len > 0 ? filename_len : (int)strlen(filename)); } } -/* +/* this is result of moving some code from libodb.a (odb/aux/util_ccode.c) for use by libifsaux.a directly ; simplifies linking sequences. @@ -4917,7 +5149,7 @@ double util_cputime_() } return (tbuf.tms_utime + tbuf.tms_stime + - tbuf.tms_cutime + tbuf.tms_cstime) / clock_ticks; + tbuf.tms_cutime + tbuf.tms_cstime) / clock_ticks; } int util_ihpstat_(int *option) @@ -4939,13 +5171,13 @@ static void set_timed_kill() int nelems = sscanf(p,"%d:%d:%d:%lf", &target_myproc, &target_omltid, &target_sig, &start_time); int ntids = drhook_oml_get_max_threads(); - if (nelems == 4 && + if (nelems == 4 && (target_myproc == myproc || target_myproc == -1) && (target_omltid == -1 || (target_omltid >= 1 && target_omltid <= ntids)) && (target_sig >= 1 && target_sig <= NSIG) && start_time > 0) { if (ntids > 1) { - extern void drhook_run_omp_parallel_ipfipipipdpstr_(const int *, + extern void drhook_run_omp_parallel_ipfipipipdpstr_(const int *, void (*func)(const int *, const int *, const int *, const double *, const char *, long), const int *, const int *, const int *, const double *, const char *, long); drhook_run_omp_parallel_ipfipipipdpstr_(&ntids,set_killer_timer, diff --git a/src/fiat/drhook/extensions/papi/drhook_papi.c b/src/fiat/drhook/extensions/papi/drhook_papi.c new file mode 100644 index 00000000..d6e51944 --- /dev/null +++ b/src/fiat/drhook/extensions/papi/drhook_papi.c @@ -0,0 +1,340 @@ +/* + * (C) Copyright 2024- ECMWF. + * + * This software is licensed under the terms of the Apache Licence Version 2.0 + * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. 
+ * In applying this licence, ECMWF does not waive the privileges and immunities + * granted to it by virtue of its status as an intergovernmental organisation + * nor does it submit to any jurisdiction. + */ + +#include "drhook_papi.h" +#include +#include +#include +#include +#include "oml.h" + +#define STD_MSG_LEN 4096 + +static int silent = 0; + +int* drhook_papi_event_set=NULL; +enum { + drhook_papi_notstarted, + drhook_papi_running, + drhook_papi_failed +}; +int drhook_papi_state=drhook_papi_notstarted; +int drhook_papi_rank=0; /* C style! */ + +static int papi_counter_event_codes[MAXNPAPICNTRS]; +static const char* papi_counter_names[MAXNPAPICNTRS]; +static int papi_counters_count = 0; + +/* function to use for thread id + - it should be better than omp_get_thread_num! +*/ +unsigned long papi_safe_thread_num(){ + return oml_my_thread()-1; +} + +int drhook_papi_max_num_counters() { + return MAXNPAPICNTRS; +} + +int drhook_papi_max_name_len() { + return PAPI_MAX_STR_LEN; +} + +void drhook_papi_counter_name(int c, char* event_name){ + PAPI_event_code_to_name(papi_counter_event_codes[c], event_name); +} + +void drhook_papi_cpy(long_long* a,long_long* b){ + for (int i=0;i0){ + char fmt[STD_MSG_LEN]; + sprintf(fmt,"%%%lds",strlen(s)); + sprintf(msg,fmt," "); + for (int i=0;i 0) { + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, library version mismatch between compilation and run!\n"); + printf("%s\n",pmsg); + return 0; + } + if (paperr == PAPI_EINVAL){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, PAPI_EINVAL\n"); + printf("%s\n",pmsg); + return 0; + } + if (paperr == PAPI_ENOMEM){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, PAPI_ENOMEM\n"); + printf("%s\n",pmsg); + return 0; + } + if (paperr == PAPI_ESBSTR){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, PAPI_ESBSTR\n"); + printf("%s\n",pmsg); + return 0; + } + if (paperr == PAPI_ESYS){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, PAPI_ESYS\n"); + printf("%s\n",pmsg); + return 0; + } + else { + 
snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, unknown error code: %d\n", paperr); + printf("%s\n",pmsg); + return 0; + } + } + + lib_version = PAPI_get_opt( PAPI_LIB_VERSION, NULL ); + + int nthreads=oml_get_max_threads(); + + paperr=PAPI_thread_init(papi_safe_thread_num); + + if( paperr != PAPI_OK ){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, thread init failed (%s)",PAPI_strerror(paperr)); + printf("%s\n",pmsg); + return 0; + } + + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Version %d.%d.%d initialised with %d threads", + PAPI_VERSION_MAJOR( lib_version ), + PAPI_VERSION_MINOR( lib_version ), + PAPI_VERSION_REVISION( lib_version ), + nthreads); + + if (drhook_papi_rank==0 && !silent) printf("%s\n",pmsg); + + drhook_papi_event_set=malloc_drhook(nthreads*sizeof(int)); + + int rcout; + drhook_run_omp_parallel_papi_startup(drhook_papi_event_set, nthreads, &rcout); + if (rcout) + return 0; + + for (int i=0; i < drhook_papi_max_num_counters(); i++) + free((void *) papi_counter_names[i]); + + drhook_papi_state=drhook_papi_running; + if (drhook_papi_rank==0 && !silent) printf("DRHOOK:PAPI: Initialisation sucess\n"); + return 1; +} + +int drhook_papi_start_threads(int* events){ + int thread=papi_safe_thread_num(); + int papiErr; + char pmsg[STD_MSG_LEN]; + + events[thread]=PAPI_NULL; + papiErr=PAPI_create_eventset(&events[thread]); + if (papiErr != PAPI_OK){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, create event set failed (%s) \n",PAPI_strerror(papiErr)); + printf("%s\n",pmsg); + return 0; + } + + if (!silent) printf("DRHOOK:PAPI: Event set %d created for thread %d\n",events[thread],thread); + + if (!silent && drhook_papi_rank==0 && thread==0) + printf("DRHOOK:PAPI: Attempting to add events to event set:\n"); + + for (int counter=0; counter < drhook_papi_num_counters(); counter ++){ + int eventCode; + + if (!silent && drhook_papi_rank==0 && thread==0) { + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: %s", papi_counter_names[counter]); + printf("%s\n",pmsg); + } + + 
papiErr=PAPI_event_name_to_code(papi_counter_names[counter], &eventCode); + if (papiErr != PAPI_OK){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, event name to code failed for %s (%s)", papi_counter_names[counter], PAPI_strerror(papiErr)); + printf("%s\n",pmsg); + PAPI_perror("initPapi"); + return 0; + } + + papi_counter_event_codes[counter] = eventCode; + papiErr=PAPI_add_event(events[thread],eventCode); + if (papiErr!=PAPI_OK){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error, add_event failed: %d (%s)",papiErr,PAPI_strerror(papiErr)); + printf("%s\n",pmsg); + if (papiErr == PAPI_EINVAL) + printf("Invalid argument\n"); + else if (papiErr == PAPI_ENOMEM) + printf("Out of memory\n"); + else if (papiErr == PAPI_ENOEVST) + printf("EventSet does not exist\n"); + else if (papiErr == PAPI_EISRUN) + printf("EventSet is running\n"); + else if (papiErr == PAPI_ECNFLCT) + printf("Conflict\n"); + else if (papiErr == PAPI_ENOEVNT) + printf("Preset not available\n"); + return 0; + } + else { +#if defined(DEBUG) + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Added code=%d to Event set %d",eventCode, events[thread]); + if (thread==0) { + printf("%s\n",pmsg); + } +#endif + } + } + + int number = drhook_papi_num_counters(); + int* checkEvents=malloc(drhook_papi_num_counters()*sizeof(int)); + papiErr = PAPI_list_events(events[thread], checkEvents, &number); + if (papiErr != PAPI_OK){ + snprintf(pmsg,STD_MSG_LEN,"DRHOOK:PAPI: Error querying events - %d=%s",papiErr,PAPI_strerror(papiErr)); + printf("%s\n",pmsg); + return 0; + } +#if defined(DEBUG) + for (int counter=0;counter + +#define MAXNPAPICNTRS 4 + +int drhook_papi_init(int rank); +int drhook_papi_num_counters(); +int drhook_papi_max_num_counters(); +int drhook_papi_max_name_len(); +void drhook_papi_counter_name(int c, char* event_name); +void drhook_papi_add_counter_name(const char* counter_name); +long_long drhook_papi_read(int counterId); +int drhook_papi_readAll(long_long* counterArray); + +/* implemented in fortran */ +int 
drhook_run_omp_parallel_papi_startup(int* drhook_papi_event_set, int nthreads, int* rcout); + +/* a = b - c +if b or c == NULL means use current readings + */ +void drhook_papi_subtract(long_long* a, long_long* b, long_long* c); + +/* a = b + c +if a==NULL, b=b+c */ +void drhook_papi_add(long_long* a, long_long* b, long_long* c); + +/* a = b */ +void drhook_papi_cpy(long_long* a, long_long* b); + +/* a=0 */ +void drhook_papi_bzero(long_long* a); + +void drhook_papi_print(char* s, long_long* a, int header); + +#endif diff --git a/src/fiat/drhook/internal/dr_hook_procinfo.F90 b/src/fiat/drhook/internal/dr_hook_procinfo.F90 index 9f36e78b..a469c0c0 100644 --- a/src/fiat/drhook/internal/dr_hook_procinfo.F90 +++ b/src/fiat/drhook/internal/dr_hook_procinfo.F90 @@ -8,16 +8,16 @@ ! nor does it submit to any jurisdiction. ! -SUBROUTINE DR_HOOK_PROCINFO(KMYPROC, KNPROC) +SUBROUTINE DR_HOOK_PROCINFO(KMYPROC, KNPROC, LMPI_INITIALIZED) USE EC_PARKIND ,ONLY : JPIM USE MPL_MPIF IMPLICIT NONE -INTEGER(KIND=JPIM),INTENT(OUT) :: KMYPROC, KNPROC -LOGICAL :: LMPI_INITIALIZED +INTEGER(KIND=JPIM), INTENT(OUT) :: KMYPROC, KNPROC +LOGICAL, INTENT(OUT) :: LMPI_INITIALIZED INTEGER(KIND=JPIM) :: IERROR -KMYPROC=-1 -KNPROC=0 +KMYPROC=1 +KNPROC=1 CALL MPI_INITIALIZED(LMPI_INITIALIZED,IERROR) IF( LMPI_INITIALIZED ) THEN CALL MPI_COMM_SIZE(MPI_COMM_WORLD,KNPROC,IERROR) @@ -25,3 +25,14 @@ SUBROUTINE DR_HOOK_PROCINFO(KMYPROC, KNPROC) KMYPROC = KMYPROC+1 ! 
1-based in IFS context ENDIF END SUBROUTINE DR_HOOK_PROCINFO + +SUBROUTINE C_DR_HOOK_PROCINFO(KMYPROC, KNPROC, KMPI_INITIALIZED) BIND(C, name="c_dr_hook_procinfo") + USE, INTRINSIC :: ISO_C_BINDING, ONLY : C_INT + IMPLICIT NONE + INTEGER(KIND=C_INT), INTENT(OUT) :: KMYPROC, KNPROC, KMPI_INITIALIZED + LOGICAL :: LLMPI_INITIALIZED + + CALL DR_HOOK_PROCINFO(KMYPROC, KNPROC, LLMPI_INITIALIZED) + + KMPI_INITIALIZED = MERGE(1, 0, LLMPI_INITIALIZED) +END SUBROUTINE C_DR_HOOK_PROCINFO diff --git a/src/fiat/drhook/internal/drhook_run_omp_parallel.F90 b/src/fiat/drhook/internal/drhook_run_omp_parallel.F90 index 36874633..0d1170f0 100644 --- a/src/fiat/drhook/internal/drhook_run_omp_parallel.F90 +++ b/src/fiat/drhook/internal/drhook_run_omp_parallel.F90 @@ -10,6 +10,18 @@ ! These functions are to be used within drhook C methods, to avoid having OMP pragmas there. +module drhook_papi_interface +#if defined(DR_HOOK_HAVE_PAPI) + interface + function drhook_papi_start_threads ( events) bind ( c ) + use, intrinsic :: iso_c_binding, only : c_int + integer(kind=c_int) :: drhook_papi_start_threads + integer(kind=c_int), intent(inout) :: events(*) + end function drhook_papi_start_threads + end interface +#endif +end module drhook_papi_interface + subroutine drhook_run_omp_parallel_ipfstr(NTIDS, FUNC, CDSTR) ! Usage: ! 
------ @@ -61,3 +73,34 @@ subroutine drhook_run_omp_parallel_get_cycles(NTIDS, NCYCLES) NCYCLES(IOMPTID) = ICYCLES - NCYCLES(IOMPTID) !$OMP END PARALLEL end subroutine drhook_run_omp_parallel_get_cycles + +#if defined(DR_HOOK_HAVE_PAPI) + +subroutine drhook_run_omp_parallel_papi_startup(events, n, rcOut) bind(c) + use, intrinsic :: iso_c_binding, only : c_int + use drhook_papi_interface + use OML_MOD + implicit none + INTEGER(KIND=C_INT), VALUE, INTENT(IN) :: n ! declared before Events(n): an array bound must be typed by a prior declaration + INTEGER(KIND=C_INT), INTENT(INOUT) :: Events(n) + INTEGER(KIND=C_INT) :: thread + INTEGER(KIND=C_INT) :: rc + INTEGER(KIND=C_INT), INTENT(OUT) :: rcOut + INTEGER :: myThread + INTEGER :: nThreads + + nThreads=OML_GET_MAX_THREADS() + rcOut=0 + !$OMP PARALLEL PRIVATE(myThread,rc) SHARED(rcOut) + myThread=OML_MY_THREAD()-1 + DO thread=0,nThreads-1 + if (thread==myThread) then + rc=drhook_papi_start_threads(events) + if (rc==0) rcOut=1 + end if + !$OMP BARRIER + END DO + !$OMP END PARALLEL + +end subroutine drhook_run_omp_parallel_papi_startup +#endif diff --git a/src/fiat/include/fiat/drhook.h b/src/fiat/include/fiat/drhook.h index bafb78e5..82d292e3 100644 --- a/src/fiat/include/fiat/drhook.h +++ b/src/fiat/include/fiat/drhook.h @@ -187,7 +187,7 @@ dr_hook_prt_(const int *ftnunitno, , int s_len); extern void -dr_hook_procinfo_(int *myproc, int *nproc); +c_dr_hook_procinfo(int *myproc, int *nproc, int *mpi_init); #ifdef __cplusplus } // extern "C" diff --git a/src/fiat/system/internal/opfla_perfmon.c b/src/fiat/system/internal/opfla_perfmon.c index 56623156..5899a9c7 100644 --- a/src/fiat/system/internal/opfla_perfmon.c +++ b/src/fiat/system/internal/opfla_perfmon.c @@ -151,7 +151,7 @@ int report_init(int periodicreport){ rc = (num = PAPI_num_counters()); if (rc != PAPI_OK) { - PAPI_perror(rc, "PAPI_num_counters", strlen("PAPI_num_counters")); + PAPI_perror("PAPI_num_counters"); } //fprintf(stderr,"PAPI_num_counters = %d\n",num); @@ -348,10 +348,12 @@ void common_inits() //initialize PAPI
counters without periodic reporting fprintf(stderr,"Calling report_init(0)\n"); init_error=report_init(0); - if (init_error) fprintf(stderr, - "Unable to init PAPI counters (init_error=%d) : %s\n", - init_error, - PAPI_strerror(init_error)); + if (init_error) + fprintf(stderr, + "Unable to init PAPI counters (init_error=%d) : %s\n", + init_error, + PAPI_strerror(init_error) + ); } } diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index abee6f50..dd4696d3 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -55,6 +55,7 @@ find_package( OpenMP COMPONENTS C ) if( TARGET OpenMP::OpenMP_C ) target_link_libraries( fiat-printbinding OpenMP::OpenMP_C ) endif() + find_package( MPI COMPONENTS C ) if( HAVE_MPI AND TARGET MPI::MPI_C ) target_link_libraries( fiat-printbinding MPI::MPI_C ) @@ -67,4 +68,3 @@ check_c_compiler_flag( "-Wno-implicit-function-declaration" disable_warning_impl if( disable_warning_implicit_function_declaration ) target_compile_options( fiat-printbinding PRIVATE "-Wno-implicit-function-declaration" ) endif() - diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index c0e0f039..73cfbd7b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -8,6 +8,8 @@ if( HAVE_TESTS ) +set( CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) + if( HAVE_MPI AND MPIEXEC ) set( LAUNCH ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ) else() @@ -101,6 +103,46 @@ set_tests_properties(fiat_test_drhook_fortran PROPERTIES ENVIRONMENT "MPL=0;DR_HOOK_ASSERT_MPI_INITIALIZED=0;DR_HOOK_OPT=NOPROPAGATE_SIGNALS" PASS_REGULAR_EXPRESSION "EC_DRHOOK.*\[DrHookCallTree\]" ) + +# ---------------------------------------------------------------------------------------- +# Tests: fiat_test_drhook_counters + +if( HAVE_DR_HOOK_PAPI ) + ecbuild_add_test(TARGET fiat_test_drhook_counters + SOURCES test_drhook_counters.F90 + test_drhook_counters_stream.F90 + test_drhook_counters_gemm.F90 + LIBS fiat + ENVIRONMENT 
"DR_HOOK_ASSERT_MPI_INITIALIZED=0;FIAT_UNIT_TEST=1" + ) + target_compile_definitions( fiat_test_drhook_counters PRIVATE OMP ) + if( NOT HAVE_MPI ) + target_compile_definitions( fiat_test_drhook_counters PRIVATE NOMPI ) + endif() + + if( CMAKE_Fortran_COMPILER_ID MATCHES Intel ) + set_source_files_properties(test_drhook_counters_stream.F90 + PROPERTIES COMPILE_OPTIONS "-qopt-prefetch-distance=64,12;-qopt-streaming-cache-evict=0;-qopt-streaming-stores always;-qopt-zmm-usage=high") + endif() + + find_package( OpenMP COMPONENTS Fortran ) + if( TARGET OpenMP::OpenMP_Fortran ) + target_link_libraries( fiat_test_drhook_counters OpenMP::OpenMP_Fortran ) + endif() + if( NOT BLAS_LIBRARIES ) + find_package( MKL QUIET ) + if( MKL_LIBRARIES ) + set( BLAS_LIBRARIES ${MKL_LIBRARIES} ) + else() + find_package( BLAS QUIET ) + endif() + endif() + if( BLAS_LIBRARIES ) + target_link_libraries( fiat_test_drhook_counters ${BLAS_LIBRARIES} ) + target_compile_definitions( fiat_test_drhook_counters PUBLIC HAVE_BLAS ) + endif() +endif() + # ---------------------------------------------------------------------------------------- # Tests: fiat_test_ec_args_fortran diff --git a/tests/drhook/CMakeLists.txt b/tests/drhook/CMakeLists.txt index f41fa5b2..ca619dfc 100644 --- a/tests/drhook/CMakeLists.txt +++ b/tests/drhook/CMakeLists.txt @@ -84,4 +84,9 @@ ecbuild_add_test( TARGET fiat_test_drhook_ex5 # NVTX if (HAVE_DR_HOOK_NVTX) add_subdirectory(drhook_nvtx) +endif () + +# PAPI +if (HAVE_DR_HOOK_PAPI) + add_subdirectory(drhook_papi) endif () \ No newline at end of file diff --git a/tests/drhook/drhook_nvtx/CMakeLists.txt b/tests/drhook/drhook_nvtx/CMakeLists.txt index 9d186b65..ef24e16d 100644 --- a/tests/drhook/drhook_nvtx/CMakeLists.txt +++ b/tests/drhook/drhook_nvtx/CMakeLists.txt @@ -57,8 +57,10 @@ ecbuild_add_test( TARGET fiat_test_drhook_nvtx_mismatched_regions ENVIRONMENT DR_HOOK=1 DR_HOOK_NVTX=1 CONDITION HAVE_DR_HOOK_NVTX ) -set_tests_properties(fiat_test_drhook_nvtx_mismatched_regions - 
PROPERTIES WILL_FAIL TRUE ) +if (TEST fiat_test_drhook_nvtx_mismatched_regions) + set_tests_properties(fiat_test_drhook_nvtx_mismatched_regions + PROPERTIES WILL_FAIL TRUE ) +endif() # Test skip on spammy regions @@ -76,8 +78,10 @@ ecbuild_add_test( TARGET fiat_test_drhook_nvtx_skip_spam_regions ENVIRONMENT DR_HOOK=1 DR_HOOK_NVTX=1 DR_HOOK_SILENT=0 CONDITION HAVE_DR_HOOK_NVTX ) -set_tests_properties(fiat_test_drhook_nvtx_skip_spam_regions - PROPERTIES PASS_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping closing of region foo" PASS_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping opening of region foo" ) +if (TEST fiat_test_drhook_nvtx_skip_spam_regions) + set_tests_properties(fiat_test_drhook_nvtx_skip_spam_regions + PROPERTIES PASS_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping closing of region foo" PASS_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping opening of region foo" ) +endif() # Test not to skip on spammy regions with long runtimes @@ -95,5 +99,7 @@ ecbuild_add_test( TARGET fiat_test_drhook_nvtx_no_skip_spam_regions ENVIRONMENT DR_HOOK=1 DR_HOOK_NVTX=1 DR_HOOK_SILENT=0 CONDITION HAVE_DR_HOOK_NVTX ) -set_tests_properties(fiat_test_drhook_nvtx_no_skip_spam_regions - PROPERTIES FAIL_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping closing of region foo" FAIL_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping opening of region foo" ) +if (TEST fiat_test_drhook_nvtx_no_skip_spam_regions) + set_tests_properties(fiat_test_drhook_nvtx_no_skip_spam_regions + PROPERTIES FAIL_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping closing of region foo" FAIL_REGULAR_EXPRESSION "DRHOOK:NVTX: Skipping opening of region foo" ) +endif() diff --git a/tests/drhook/drhook_papi/CMakeLists.txt b/tests/drhook/drhook_papi/CMakeLists.txt new file mode 100644 index 00000000..0f641e97 --- /dev/null +++ b/tests/drhook/drhook_papi/CMakeLists.txt @@ -0,0 +1,138 @@ +# +# (C) Copyright 2024- ECMWF. 
+# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. +# + +# Test basic implementation + +ecbuild_add_executable( TARGET drhook_papi_basic + SOURCES drhook_papi_basic.F90 + LIBS fiat + LINKER_LANGUAGE Fortran + CONDITION HAVE_DR_HOOK_PAPI + NOINSTALL ) + +ecbuild_add_test( TARGET fiat_test_drhook_papi_basic + COMMAND drhook_papi_basic + ENVIRONMENT DR_HOOK=1 DR_HOOK_OPT=COUNTERS + CONDITION HAVE_DR_HOOK_PAPI ) + +if (TEST fiat_test_drhook_papi_basic) + set_tests_properties(fiat_test_drhook_papi_basic + PROPERTIES PASS_REGULAR_EXPRESSION "Writing counter information of proc#1 into file" ) +endif() + + +ecbuild_add_test( TARGET fiat_test_drhook_papi_basic_valid_csv + TYPE SCRIPT + # Just making sure it's not an empty file + COMMAND "find" + ARGS "." "-name" "drhook.prof.1.csv" "-type" "f" "-size" "+100c" + CONDITION HAVE_DR_HOOK_PAPI ) + +if (TEST fiat_test_drhook_papi_basic_valid_csv) + set_tests_properties(fiat_test_drhook_papi_basic_valid_csv + PROPERTIES DEPENDS fiat_test_drhook_papi_basic + PASS_REGULAR_EXPRESSION "drhook.prof.1.csv" ) +endif() + +# Test MPI implementation + +ecbuild_add_executable( TARGET drhook_papi_mpi + SOURCES drhook_papi_mpi.F90 + LIBS fiat + LINKER_LANGUAGE Fortran + CONDITION HAVE_DR_HOOK_PAPI AND HAVE_MPI + NOINSTALL ) + +ecbuild_add_test( TARGET fiat_test_drhook_papi_mpi + COMMAND drhook_papi_mpi + MPI 5 + ENVIRONMENT DR_HOOK=1 DR_HOOK_OPT=COUNTERS DR_HOOK_PROFILE=fiat_test_drhook_papi_mpi + CONDITION HAVE_DR_HOOK_PAPI AND HAVE_MPI ) + + +ecbuild_add_test( TARGET fiat_test_drhook_papi_mpi_valid_csv + TYPE SCRIPT + # Just making sure it's not an empty file + # We have to do this weird thing with bash so that we can + # use a redirect. 
CMake tests are really basic... + COMMAND "bash" + ARGS "-c" "find . -name 'fiat_test_drhook_papi_mpi.[1-5].csv' -type f -size +100c | wc -l" + CONDITION HAVE_DR_HOOK_PAPI AND HAVE_MPI ) + +if (TEST fiat_test_drhook_papi_mpi_valid_csv) + set_tests_properties(fiat_test_drhook_papi_mpi_valid_csv + PROPERTIES DEPENDS fiat_test_drhook_papi_mpi + PASS_REGULAR_EXPRESSION "5" ) +endif() + +# Test user specified output file names + +ecbuild_add_executable( TARGET drhook_papi_user_filename + SOURCES drhook_papi_user_filename.F90 + LIBS fiat + LINKER_LANGUAGE Fortran + CONDITION HAVE_DR_HOOK_PAPI + NOINSTALL ) + +ecbuild_add_test( TARGET fiat_test_drhook_papi_user_filename + COMMAND drhook_papi_user_filename + ENVIRONMENT DR_HOOK=1 DR_HOOK_OPT=COUNTERS DR_HOOK_PROFILE=fiat_test_drhook_papi_user_filename + CONDITION HAVE_DR_HOOK_PAPI ) + +ecbuild_add_test( TARGET fiat_test_drhook_papi_user_filename_valid_csv + TYPE SCRIPT + # Just making sure it's not an empty file + COMMAND "find" + ARGS "." "-name" "fiat_test_drhook_papi_user_filename.1.csv" "-type" "f" "-size" "+100c" + CONDITION HAVE_DR_HOOK_PAPI ) + +if (TEST fiat_test_drhook_papi_user_filename_valid_csv) + set_tests_properties(fiat_test_drhook_papi_user_filename_valid_csv + PROPERTIES DEPENDS fiat_test_drhook_papi_user_filename + PASS_REGULAR_EXPRESSION "fiat_test_drhook_papi_user_filename.1.csv" ) +endif() + +# Test user specified counters + +ecbuild_add_executable( TARGET drhook_papi_user_counters + SOURCES drhook_papi_user_counters.F90 + LIBS fiat + LINKER_LANGUAGE Fortran + CONDITION HAVE_DR_HOOK_PAPI + NOINSTALL ) + +ecbuild_add_test( TARGET fiat_test_drhook_papi_user_counters + COMMAND drhook_papi_user_counters + ENVIRONMENT DR_HOOK=1 DR_HOOK_OPT=COUNTERS DR_HOOK_PAPI_COUNTERS=PAPI_TOT_INS DR_HOOK_PROFILE=fiat_test_drhook_papi_user_counters + CONDITION HAVE_DR_HOOK_PAPI ) + +if (TEST fiat_test_drhook_papi_user_counters) + set_tests_properties(fiat_test_drhook_papi_user_counters + PROPERTIES PASS_REGULAR_EXPRESSION "PAPI_TOT_INS"
) +endif() + +# Test user specified counters going over max allowed + +ecbuild_add_executable( TARGET drhook_papi_user_counters_more_than_max + SOURCES drhook_papi_user_counters_more_than_max.F90 + LIBS fiat + LINKER_LANGUAGE Fortran + CONDITION HAVE_DR_HOOK_PAPI + NOINSTALL ) + +ecbuild_add_test( TARGET fiat_test_drhook_papi_user_counters_more_than_max + COMMAND drhook_papi_user_counters_more_than_max + ENVIRONMENT DR_HOOK=1 DR_HOOK_OPT=COUNTERS DR_HOOK_PAPI_COUNTERS=PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_L1_DCA,PAPI_L2_DCM,PAPI_TOT_INS DR_HOOK_PROFILE=drhook_papi_user_counters_more_than_max + CONDITION HAVE_DR_HOOK_PAPI ) + +if (TEST fiat_test_drhook_papi_user_counters_more_than_max) + set_tests_properties(fiat_test_drhook_papi_user_counters_more_than_max + PROPERTIES FAIL_REGULAR_EXPRESSION "PAPI_TOT_INS" ) +endif() diff --git a/tests/drhook/drhook_papi/drhook_papi_basic.F90 b/tests/drhook/drhook_papi/drhook_papi_basic.F90 new file mode 100644 index 00000000..ed200208 --- /dev/null +++ b/tests/drhook/drhook_papi/drhook_papi_basic.F90 @@ -0,0 +1,27 @@ +! (C) Copyright 2024- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. + +program drhook_papi_basic + + use yomhook, only : jphook, dr_hook + + implicit none + + real(jphook) :: zhook_handle + integer :: a + + call dr_hook('drhook_papi_basic', 0, zhook_handle) + + a = 1 + a = a + a + + call dr_hook('drhook_papi_basic', 1, zhook_handle) + +end program drhook_papi_basic + diff --git a/tests/drhook/drhook_papi/drhook_papi_mpi.F90 b/tests/drhook/drhook_papi/drhook_papi_mpi.F90 new file mode 100644 index 00000000..dce412f9 --- /dev/null +++ b/tests/drhook/drhook_papi/drhook_papi_mpi.F90 @@ -0,0 +1,34 @@ +! 
(C) Copyright 2024- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. + +program drhook_papi_mpi + use mpl_module + use yomhook, only : jphook, dr_hook + use sdl_mod, only : sdl_traceback + implicit none + integer jpe, npes, mype, a + character(len=256) arg, env + real(jphook) :: zhook_handle + + call mpl_init(ldinfo=.false.) + call dr_hook('drhook_papi_mpi',0,zhook_handle) + + npes = mpl_nproc() + mype = mpl_myrank() + + do jpe=1,npes + if (mype == jpe) then + a = a + jpe + endif + enddo + + call mpl_barrier() + call dr_hook('drhook_papi_mpi',1,zhook_handle) + call mpl_end() +end program drhook_papi_mpi diff --git a/tests/drhook/drhook_papi/drhook_papi_user_counters.F90 b/tests/drhook/drhook_papi/drhook_papi_user_counters.F90 new file mode 100644 index 00000000..00796d17 --- /dev/null +++ b/tests/drhook/drhook_papi/drhook_papi_user_counters.F90 @@ -0,0 +1,26 @@ +! (C) Copyright 2024- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. 
+ +program drhook_papi_user_counters + + use yomhook, only : jphook, dr_hook + + implicit none + + real(jphook) :: zhook_handle + integer :: a + + call dr_hook('drhook_papi_user_counters', 0, zhook_handle) + + a = 1 + a = a + a + + call dr_hook('drhook_papi_user_counters', 1, zhook_handle) + +end program drhook_papi_user_counters diff --git a/tests/drhook/drhook_papi/drhook_papi_user_counters_more_than_max.F90 b/tests/drhook/drhook_papi/drhook_papi_user_counters_more_than_max.F90 new file mode 100644 index 00000000..ffc8b386 --- /dev/null +++ b/tests/drhook/drhook_papi/drhook_papi_user_counters_more_than_max.F90 @@ -0,0 +1,26 @@ +! (C) Copyright 2024- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! +! In applying this licence, ECMWF does not waive the privileges and immunities +! granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. + +program drhook_papi_user_counters_more_than_max + + use yomhook, only : jphook, dr_hook + + implicit none + + real(jphook) :: zhook_handle + integer :: a + + call dr_hook('drhook_papi_user_counters_more_than_max', 0, zhook_handle) + + a = 1 + a = a + a + + call dr_hook('drhook_papi_user_counters_more_than_max', 1, zhook_handle) + +end program drhook_papi_user_counters_more_than_max diff --git a/tests/drhook/drhook_papi/drhook_papi_user_filename.F90 b/tests/drhook/drhook_papi/drhook_papi_user_filename.F90 new file mode 100644 index 00000000..4ca4c3e0 --- /dev/null +++ b/tests/drhook/drhook_papi/drhook_papi_user_filename.F90 @@ -0,0 +1,26 @@ +! (C) Copyright 2024- ECMWF. +! +! This software is licensed under the terms of the Apache Licence Version 2.0 +! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +! +! In applying this licence, ECMWF does not waive the privileges and immunities +! 
granted to it by virtue of its status as an intergovernmental organisation +! nor does it submit to any jurisdiction. + +program drhook_papi_user_filename + + use yomhook, only : jphook, dr_hook + + implicit none + + real(jphook) :: zhook_handle + integer :: a + + call dr_hook('drhook_papi_user_filename', 0, zhook_handle) + + a = 1 + a = a + a + + call dr_hook('drhook_papi_user_filename', 1, zhook_handle) + +end program drhook_papi_user_filename diff --git a/tests/test_drhook_counters.F90 b/tests/test_drhook_counters.F90 new file mode 100644 index 00000000..d425e284 --- /dev/null +++ b/tests/test_drhook_counters.F90 @@ -0,0 +1,100 @@ +program fiat_test_drhook_counters + use oml_mod ,only : oml_max_threads + use mpl_module, only : mpl_init, mpl_end, mpl_nproc, mpl_myrank + use yomhook, only : LHOOK,DR_HOOK,JPHOOK,dr_hook_init,dr_hook_end + use test_drhook_counters_stream_mod, only : stream_combinations + use test_drhook_counters_gemm_mod, only : gemm_combinations + use ec_env_mod, only : ec_setenv + + implicit none + logical :: luse_mpi = .true. + logical :: lsmall_problem_size = .false. + integer :: myproc,nproc + integer :: verbosity = 0 + + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + + luse_mpi = detect_mpirun() + lsmall_problem_size = detect_FIAT_UNIT_TEST() + + if (luse_mpi) then + call mpl_init(ldinfo=(verbosity>=1)) + nproc = mpl_nproc() + myproc = mpl_myrank() + else + nproc = 1 + myproc = 1 + endif + + if (myproc.eq.1) write(6,*)'Starting Tasks=',nproc,'threads=',oml_max_threads() + + call ec_setenv("DR_HOOK", "1", overwrite=.true.) + call ec_setenv("DR_HOOK_OPT", "COUNTERS", overwrite=.true.) 
+ + call dr_hook_init() + + IF (LHOOK) CALL DR_HOOK('MAIN',0,ZHOOK_HANDLE) + + if (myproc.eq.1) write(6,*) "================================================= BENCHMARK STREAM START" + if (lsmall_problem_size) then + call stream_combinations(int(1024*32,kind=8)) + else + call stream_combinations() + endif + if (myproc.eq.1) write(6,*) "================================================= BENCHMARK STREAM END" + + if (myproc.eq.1) write(6,*) "================================================= BENCHMARK GEMM START" + if (lsmall_problem_size) then + call gemm_combinations(int(250,kind=8)) + else + call gemm_combinations() + endif + write(6,*) "================================================= BENCHMARK GEMM END" + + IF (LHOOK) CALL DR_HOOK('MAIN',1,ZHOOK_HANDLE) + + call dr_hook_end() + + if (luse_mpi) then + call mpl_end(ldmeminfo=.false.) + endif + if (myproc.eq.1) write(6,*)'Completed' +contains + function detect_mpirun() result(lmpi_required) + logical :: lmpi_required + integer :: ilen + integer, parameter :: nvars = 5 + character(len=32), dimension(nvars) :: cmpirun_detect + character(len=4) :: clenv_dr_hook_assert_mpi_initialized + integer :: ivar + lmpi_required = .false. +#if defined(NOMPI) + return +#endif + ! Environment variables that are set when mpirun, srun, aprun, ... are used + cmpirun_detect(1) = 'OMPI_COMM_WORLD_SIZE' ! openmpi + cmpirun_detect(2) = 'ALPS_APP_PE' ! cray pe + cmpirun_detect(3) = 'PMI_SIZE' ! intel + cmpirun_detect(4) = 'SLURM_NTASKS' ! slurm + cmpirun_detect(5) = 'FIAT_USE_MPI' ! forced + + do ivar = 1, nvars + call get_environment_variable(name=trim(cmpirun_detect(ivar)), length=ilen) + if (ilen > 0) then + lmpi_required = .true. + exit ! break + endif + enddo +end function + +function detect_FIAT_UNIT_TEST() result(lunit_test) + logical :: lunit_test + integer :: ilen + lunit_test = .false. + call get_environment_variable(name='FIAT_UNIT_TEST', length=ilen) + if (ilen > 0) then + lunit_test = .true. 
+ endif +end function + +end program diff --git a/tests/test_drhook_counters_gemm.F90 b/tests/test_drhook_counters_gemm.F90 new file mode 100644 index 00000000..bb9affb3 --- /dev/null +++ b/tests/test_drhook_counters_gemm.F90 @@ -0,0 +1,88 @@ +module test_drhook_counters_gemm_mod + use yomhook, only : lhook,dr_hook,jphook + implicit none + +contains + subroutine gemm_combinations(n_init) + implicit none + integer(kind=8), intent(in), optional :: n_init + integer(kind=8) :: n,i + real(kind=jphook) :: zhook_handle + n=1000 + if (present(n_init)) then + n = n_init + endif +#if defined(HAVE_BLAS) + if (lhook) call dr_hook('GEMM_ALL',0,zhook_handle) + do i=1,4 + call dgemm_driver(n) + call sgemm_driver(n) + n=n*2 + end do + if (lhook) call dr_hook('GEMM_ALL',1,zhook_handle) +#endif + end subroutine gemm_combinations + +#if defined(HAVE_BLAS) + subroutine dgemm_driver(nn) + implicit none + double precision, allocatable :: a(:,:),b(:,:),c(:,:) + double precision :: alpha,beta + integer :: m,k,n + integer :: i,j + integer*8 :: nn + real(kind=jphook) :: zhook_handle + character(len=25) :: tag + + write(tag,'(i20)')nn + tag="_n="//adjustl(tag) + m=nn + n=nn + k=nn + alpha=1.0 + beta=0.0 + + allocate(a(m,k), b(k,n), c(m,n)) + a=1.0 + b=2.0 + c=3.0 + if (lhook) call dr_hook('DGEMM'//TRIM(tag),0,zhook_handle) + call dgemm('n','n',m,n,k,alpha,a,m,b,k,beta,c,m) + if (lhook) call dr_hook('DGEMM'//TRIM(tag),1,zhook_handle) + + return + + end subroutine dgemm_driver + + subroutine sgemm_driver(nn) + implicit none + real*4, allocatable :: a(:,:),b(:,:),c(:,:) + real*4 :: alpha,beta + integer :: m,k,n + integer :: i,j + integer*8 :: nn + real(kind=jphook) :: zhook_handle + character(len=25) :: tag + + write(tag,'(i20)')nn + tag="_n="//adjustl(tag) + m=nn + n=nn + k=nn + alpha=1.0 + beta=0.0 + + allocate(a(m,k), b(k,n), c(m,n)) + a=1.0 + b=2.0 + c=3.0 + if (lhook) call dr_hook('SGEMM'//TRIM(tag),0,zhook_handle) + call sgemm('n','n',m,n,k,alpha,a,m,b,k,beta,c,m) + if (lhook) call 
dr_hook('SGEMM'//TRIM(tag),1,zhook_handle) + + return + + end subroutine sgemm_driver +#endif + +end module diff --git a/tests/test_drhook_counters_stream.F90 b/tests/test_drhook_counters_stream.F90 new file mode 100644 index 00000000..ee575b11 --- /dev/null +++ b/tests/test_drhook_counters_stream.F90 @@ -0,0 +1,464 @@ +MODULE test_drhook_counters_stream_mod + !======================================================================= + ! Program: STREAM + ! Programmer: John D. McCalpin + ! RCS Revision: $Id: stream.f,v 5.6 2005/10/04 00:20:48 mccalpin Exp mccalpin $ + !----------------------------------------------------------------------- + ! Copyright 1991-2003: John D. McCalpin + !----------------------------------------------------------------------- + ! License: + ! 1. You are free to use this program and/or to redistribute + ! this program. + ! 2. You are free to modify this program for your own use, + ! including commercial use, subject to the publication + ! restrictions in item 3. + ! 3. You are free to publish results obtained from running this + ! program, or from works that you derive from this program, + ! with the following limitations: + ! 3a. In order to be referred to as "STREAM benchmark results", + ! published results must be in conformance to the STREAM + ! Run Rules, (briefly reviewed below) published at + ! http://www.cs.virginia.edu/stream/ref.html + ! and incorporated herein by reference. + ! As the copyright holder, John McCalpin retains the + ! right to determine conformity with the Run Rules. + ! 3b. Results based on modified source code or on runs not in + ! accordance with the STREAM Run Rules must be clearly + ! labelled whenever they are published. Examples of + ! proper labelling include: + ! "tuned STREAM benchmark results" + ! "based on a variant of the STREAM benchmark code" + ! Other comparable, clear and reasonable labelling is + ! acceptable. + ! 3c. Submission of results to the STREAM benchmark web site + ! 
is encouraged, but not required. + ! 4. Use of this program or creation of derived works based on this + ! program constitutes acceptance of these licensing restrictions. + ! 5. Absolutely no warranty is expressed or implied. + !----------------------------------------------------------------------- + ! This program measures sustained memory transfer rates in MB/s for + ! simple computational kernels coded in FORTRAN. + ! + ! The intent is to demonstrate the extent to which ordinary user + ! code can exploit the main memory bandwidth of the system under + ! test. + use yomhook, only : lhook,dr_hook,jphook + +contains + subroutine stream_combinations(n_init) + implicit none + integer(kind=8), intent(in), optional :: n_init + integer(kind=8) :: n, ntimes, i + real(kind=jphook) :: zhook_handle + n=1024*1024 + if (present(n_init)) then + n = n_init + endif + ntimes=1024 + if (lhook) call dr_hook('STREAM',0,zhook_handle) + do i=1,3 + write(6,'(" =============================== CALL STREAM(",I0,",",I0,")")') n, ntimes + call stream(n,ntimes) + n=n*8 + ntimes=ntimes/8 + end do + if (lhook) call dr_hook('STREAM',1,zhook_handle) + + end subroutine stream_combinations + + SUBROUTINE stream(n,ntimes) +!$ USE omp_lib + INTEGER*8 n,offset,ndim + INTEGER*8 ntimes + PARAMETER (offset=0) + ! .. + ! .. Local Scalars .. + DOUBLE PRECISION scalar,t + INTEGER j,k,nbpw,quantum + ! .. + ! .. Local Arrays .. + DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4), & + times(4,ntimes) + INTEGER bytes(4) + CHARACTER label(4)*11 + ! .. + ! .. External Functions .. + DOUBLE PRECISION timef + REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + REAL(KIND=JPHOOK) :: ZHOOK_1,ZHOOK_2,ZHOOK_3,ZHOOK_4 + CHARACTER(len=29) :: tag + +! INTEGER realsize + EXTERNAL mysecond !,checktick !,realsize + ! .. + ! .. Intrinsic Functions .. + ! + INTRINSIC dble,max,min,nint,sqrt + ! .. + ! .. Arrays in Common .. + DOUBLE PRECISION, allocatable :: a(:),b(:),c(:) + !dir$ attributes align:64 :: A, B, C +! 
CHARACTER(len=40) :: suffix + ! .. + ! .. Common blocks .. + ! COMMON a,b,c + ! .. + ! .. Data statements .. + DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/ + DATA label/'Copy: ','Scale: ','Add: ','Triad: '/ + DATA bytes/2,2,3,3/ + ! .. +! WRITE(suffix,'(A,I30)')"_",n + ! --- SETUP --- determine precision and check timing --- + ndim=n+offset + allocate(a(ndim),b(ndim),c(ndim)) + nbpw = realsize() + write(tag,'(I20)')n +!$ if (omp_in_parallel()) then +!$ tag="_par_n="//adjustl(tag) +!$ else + tag="_n="//adjustl(tag) +!$ end if + + PRINT *,'----------------------------------------------' + PRINT *,'STREAM Version $Revision: 5.6 $' + PRINT *,'----------------------------------------------' + WRITE (*,FMT=9010) 'Array size = ',n + WRITE (*,FMT=9010) 'Offset = ',offset + WRITE (*,FMT=9020) 'The total memory requirement is ', & + 3*nbpw*n/ (1024*1024),' MB' + WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times' + WRITE (*,FMT=9030) '--' + WRITE (*,FMT=9030) 'The *best* time for each test is used' + WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations' +!$OMP PARALLEL +!$OMP MASTER + PRINT *,'----------------------------------------------' +!$ PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS() +!$OMP END MASTER +!$OMP END PARALLEL + + PRINT *,'----------------------------------------------' + +!$OMP PARALLEL DO + DO 10 j = 1,n + a(j) = 2.0d0 + b(j) = 0.5D0 + c(j) = 0.0D0 +10 END DO + t = timef() +!$OMP PARALLEL DO + DO 20 j = 1,n + a(j) = 0.5d0*a(j) +20 END DO + t = timef() - t + PRINT *,'----------------------------------------------------' + quantum = checktick() + WRITE (*,FMT=9000) & + 'Your clock granularity/precision appears to be ',quantum, & + ' microseconds' + PRINT *,'----------------------------------------------------' + + ! 
--- MAIN LOOP --- repeat test cases NTIMES times --- + scalar = 0.5d0*a(1) + DO 70 k = 1,ntimes + + IF (LHOOK) CALL DR_HOOK('STREAM_COPY'//TRIM(tag),0,ZHOOK_1) + t = timef() + a(1) = a(1) + t +!$OMP PARALLEL DO + DO 30 j = 1,n + c(j) = a(j) +30 END DO + t = timef() - t + IF (LHOOK) CALL DR_HOOK('STREAM_COPY'//TRIM(tag),1,ZHOOK_1) + + c(n) = c(n) + t + times(1,k) = t + + IF (LHOOK) CALL DR_HOOK('STREAM_SCALE'//TRIM(tag),0,ZHOOK_2) + t = timef() + c(1) = c(1) + t +!$OMP PARALLEL DO + DO 40 j = 1,n + b(j) = scalar*c(j) +40 END DO + t = timef() - t + IF (LHOOK) CALL DR_HOOK('STREAM_SCALE'//TRIM(tag),1,ZHOOK_2) + + b(n) = b(n) + t + times(2,k) = t + + IF (LHOOK) CALL DR_HOOK('STREAM_ADD'//TRIM(tag),0,ZHOOK_3) + t = timef() + a(1) = a(1) + t +!$OMP PARALLEL DO + DO 50 j = 1,n + c(j) = a(j) + b(j) +50 END DO + t = timef() - t + IF (LHOOK) CALL DR_HOOK('STREAM_ADD'//TRIM(tag),1,ZHOOK_3) + c(n) = c(n) + t + times(3,k) = t + + IF (LHOOK) CALL DR_HOOK('STREAM_TRIAD'//TRIM(tag),0,ZHOOK_4) + t = timef() + b(1) = b(1) + t +!$OMP PARALLEL DO + DO 60 j = 1,n + a(j) = b(j) + scalar*c(j) +60 END DO + t = timef() - t + IF (LHOOK) CALL DR_HOOK('STREAM_TRIAD'//TRIM(tag),1,ZHOOK_4) + + a(n) = a(n) + t + times(4,k) = t +70 END DO + + ! 
--- SUMMARY --- + DO 90 k = 2,ntimes + DO 80 j = 1,4 + avgtime(j) = avgtime(j) + times(j,k) + mintime(j) = min(mintime(j),times(j,k)) + maxtime(j) = max(maxtime(j),times(j,k)) +80 END DO +90 END DO + WRITE (*,FMT=9040) + DO 100 j = 1,4 + avgtime(j) = avgtime(j)/dble(ntimes-1) + WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6, & + avgtime(j),mintime(j),maxtime(j) +100 END DO + PRINT *,'----------------------------------------------------' + CALL checksums (a,b,c,n,ntimes) + PRINT *,'----------------------------------------------------' + +9000 FORMAT (1x,a,i6,a) +9010 FORMAT (1x,a,i10) +9020 FORMAT (1x,a,i7,a) +9030 FORMAT (1x,a,i5,a,a) +9040 FORMAT ('Function',5x,'Rate (MB/s) Avg time Min time Max time' & + ) +9050 FORMAT (a,4 (f12.4,2x)) + END SUBROUTINE stream + + !------------------------------------- + ! INTEGER FUNCTION dblesize() + ! + ! A semi-portable way to determine the precision of DOUBLE PRECISION + ! in Fortran. + ! Here used to guess how many bytes of storage a DOUBLE PRECISION + ! number occupies. + ! + INTEGER FUNCTION realsize() + ! IMPLICIT NONE + + ! .. Local Scalars .. + DOUBLE PRECISION result,test + INTEGER j,ndigits + ! .. + ! .. Local Arrays .. + DOUBLE PRECISION ref(30) + ! .. + ! .. External Subroutines .. +! EXTERNAL confuse + ! .. + ! .. Intrinsic Functions .. + INTRINSIC abs,acos,log10,sqrt + ! .. + + ! 
Test #1 - compare single(1.0d0+delta) to 1.0d0 + +10 DO 20 j = 1,30 + ref(j) = 1.0d0 + 10.0d0** (-j) +20 END DO + + DO 30 j = 1,30 + test = ref(j) + ndigits = j + CALL confuse(test,result) + IF (test.EQ.1.0D0) THEN + GO TO 40 + END IF +30 END DO + GO TO 50 + +40 WRITE (*,FMT='(a)') & + '----------------------------------------------' + WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ', & + ndigits,' digits of accuracy' + IF (ndigits.LE.8) THEN + realsize = 4 + ELSE + realsize = 8 + END IF + WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize, & + ' bytes per DOUBLE PRECISION word' + WRITE (*,FMT='(a)') & + '----------------------------------------------' + RETURN + +50 PRINT *,'Hmmmm. I am unable to determine the size.' + PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION', & + ' number : ' + READ (*,FMT=*) realsize + IF (realsize.NE.4 .AND. realsize.NE.8) THEN + PRINT *,'Your answer ',realsize,' does not make sense.' + PRINT *,'Try again.' + PRINT *,'Please enter the number of Bytes per ', & + 'DOUBLE PRECISION number : ' + READ (*,FMT=*) realsize + END IF + PRINT *,'You have manually entered a size of ',realsize, & + ' bytes per DOUBLE PRECISION number' + WRITE (*,FMT='(a)') & + '----------------------------------------------' + END FUNCTION realsize + + SUBROUTINE confuse(q,r) + ! IMPLICIT NONE + ! .. Scalar Arguments .. + DOUBLE PRECISION q,r + ! .. + ! .. Intrinsic Functions .. + INTRINSIC cos + ! .. + r = cos(q) + RETURN +END SUBROUTINE confuse + +! A semi-portable way to determine the clock granularity +! Adapted from a code by John Henning of Digital Equipment Corporation +! +INTEGER FUNCTION checktick() + ! IMPLICIT NONE + + ! .. Parameters .. + INTEGER n + PARAMETER (n=20) + ! .. + ! .. Local Scalars .. + DOUBLE PRECISION t1,t2 + INTEGER i,j,jmin + ! .. + ! .. Local Arrays .. + DOUBLE PRECISION timesfound(n) + ! .. + ! .. External Functions .. + DOUBLE PRECISION timef + EXTERNAL timef + ! .. + ! .. Intrinsic Functions .. 
+ INTRINSIC max,min,nint + ! .. + i = 0 + t1=-1 +10 t2 = timef() + IF (t2.EQ.t1) GO TO 10 + + t1 = t2 + i = i + 1 + timesfound(i) = t1 + IF (i.LT.n) GO TO 10 + + jmin = 1000000 + DO 20 i = 2,n + j = nint((timesfound(i)-timesfound(i-1))*1d6) + jmin = min(jmin,max(j,0)) +20 END DO + + IF (jmin.GT.0) THEN + checktick = jmin + ELSE + PRINT *,'Your clock granularity appears to be less ', & + 'than one microsecond' + checktick = 1 + END IF + RETURN + + ! PRINT 14, timesfound(1)*1d6 + ! DO 20 i=2,n + ! PRINT 14, timesfound(i)*1d6, + ! & nint((timesfound(i)-timesfound(i-1))*1d6) + ! 14 FORMAT (1X, F18.4, 1X, i8) + ! 20 END DO + +END FUNCTION checktick + + + + +SUBROUTINE checksums(a,b,c,n,ntimes) + ! IMPLICIT NONE + ! .. + ! .. Arguments .. + DOUBLE PRECISION a(*),b(*),c(*) + INTEGER*8 n,ntimes + ! .. + ! .. Local Scalars .. + DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon + INTEGER k + ! .. + + ! Repeat the main loop, but with scalars only. + ! This is done to check the sum & make sure all + ! iterations have been executed correctly. + + aa = 2.0D0 + bb = 0.5D0 + cc = 0.0D0 + aa = 0.5D0*aa + scalar = 0.5d0*aa + DO k = 1,ntimes + cc = aa + bb = scalar*cc + cc = aa + bb + aa = bb + scalar*cc + END DO + aa = aa*DBLE(n-2) + bb = bb*DBLE(n-2) + cc = cc*DBLE(n-2) + + ! Now sum up the arrays, excluding the first and last + ! elements, which are modified using the timing results + ! to confuse aggressive optimizers. + + suma = 0.0d0 + sumb = 0.0d0 + sumc = 0.0d0 + !$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc) + DO 110 j = 2,n-1 + suma = suma + a(j) + sumb = sumb + b(j) + sumc = sumc + c(j) +110 END DO + + epsilon = 1.D-6 + + IF (ABS(suma-aa)/suma .GT. epsilon) THEN + PRINT *,'Failed Validation on array a()' + PRINT *,'Target Sum of a is = ',aa + PRINT *,'Computed Sum of a is = ',suma + ELSEIF (ABS(sumb-bb)/sumb .GT. 
epsilon) THEN + PRINT *,'Failed Validation on array b()' + PRINT *,'Target Sum of b is = ',bb + PRINT *,'Computed Sum of b is = ',sumb + ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN + PRINT *,'Failed Validation on array c()' + PRINT *,'Target Sum of c is = ',cc + PRINT *,'Computed Sum of c is = ',sumc + ELSE + PRINT *,'Solution Validates!' + ENDIF + +END SUBROUTINE checksums + +function itoa(i) result(res) + character(:),allocatable :: res + integer,intent(in) :: i + character(range(i)+2) :: tmp + write(tmp,'(i0)') i + res = trim(tmp) +end function itoa + +END MODULE