Skip to content

Commit

Permalink
Merge DusanJovic-NOAA/rt_squeue_state
Browse files Browse the repository at this point in the history
Improve error checking in rt.sh ufs-community#2388
  • Loading branch information
NickSzapiro-NOAA authored Aug 21, 2024
2 parents 0e49a81 + d38f4f5 commit 87535af
Show file tree
Hide file tree
Showing 8 changed files with 193 additions and 198 deletions.
17 changes: 5 additions & 12 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
#!/bin/bash
set -eu
uname_s=$(uname -s)
if [[ ${uname_s} == Darwin ]]; then
UFS_MODEL_DIR=$(greadlink -f -n "${BASH_SOURCE[0]}")
UFS_MODEL_DIR=$(dirname "${UFS_MODEL_DIR}")
UFS_MODEL_DIR=$(cd "${UFS_MODEL_DIR}" && pwd -P)
else
UFS_MODEL_DIR=$(readlink -f -n "${BASH_SOURCE[0]}")
UFS_MODEL_DIR=$(dirname "${UFS_MODEL_DIR}")
UFS_MODEL_DIR=$(cd "${UFS_MODEL_DIR}" && pwd -P)
fi
echo "UFS MODEL DIR: ${UFS_MODEL_DIR}"

SCRIPT_REALPATH=$(realpath "${BASH_SOURCE[0]}")
UFS_MODEL_DIR=$(dirname "${SCRIPT_REALPATH}")
readonly UFS_MODEL_DIR
echo "UFS MODEL DIR: ${UFS_MODEL_DIR}"

export CC=${CC:-mpicc}
export CXX=${CXX:-mpicxx}
Expand All @@ -26,4 +19,4 @@ for i in ${CMAKE_FLAGS}; do ARR_CMAKE_FLAGS+=("${i}") ; done
cmake "${UFS_MODEL_DIR}" "${ARR_CMAKE_FLAGS[@]}"
# Turn off OpenMP threading for parallel builds
# to avoid exhausting the number of user processes
OMP_NUM_THREADS=1 make -j "${BUILD_JOBS:-4}" "VERBOSE=${BUILD_VERBOSE:-}"
OMP_NUM_THREADS=1 make -j "${BUILD_JOBS:-4}" "VERBOSE=${BUILD_VERBOSE:-}"
10 changes: 2 additions & 8 deletions tests/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,8 @@ function trim {

SECONDS=0

uname_s=$(uname -s)
if [[ ${uname_s} == Darwin ]]; then
greadlnk=$(greadlink -f -n "${BASH_SOURCE[0]}" )
MYDIR=$(cd "$(dirname "${greadlnk}" )" && pwd -P)
else
readlnk=$(readlink -f -n "${BASH_SOURCE[0]}" )
MYDIR=$(cd "$(dirname "${readlnk}" )" && pwd -P)
fi
SCRIPT_REALPATH=$(realpath "${BASH_SOURCE[0]}")
MYDIR=$(dirname "${SCRIPT_REALPATH}")
readonly MYDIR

# ----------------------------------------------------------------------
Expand Down
3 changes: 3 additions & 0 deletions tests/error-test.conf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ COMPILE | atm_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16,FV3_GFS_v16_fl
# This should succeed
RUN | control_c48.v2.sfc | | baseline |

# This should fail due to wall clock timeout
RUN | control_c48.v2.sfc_timeout | | baseline |

# These tests should always fail, and prevent the workflow from completing.
RUN | fail_to_copy | | baseline |
RUN | fail_to_run | | baseline |
Expand Down
1 change: 1 addition & 0 deletions tests/rt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,7 @@ if [[ ${skip_check_results} == true ]]; then
else
REGRESSIONTEST_LOG=${PATHRT}/logs/RegressionTests_${MACHINE_ID}.log
fi
rm -f "${REGRESSIONTEST_LOG}"

TEST_START_TIME="$(date '+%Y%m%d %T')"
export TEST_START_TIME
Expand Down
201 changes: 40 additions & 161 deletions tests/rt_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,6 @@ submit_and_wait() {

local -r job_card=$1

ROCOTO=${ROCOTO:-false}
ECFLOW=${ECFLOW:-false}

local test_status='PASS'
case ${SCHEDULER} in
pbs)
qsubout=$( qsub "${job_card}" )
Expand Down Expand Up @@ -187,26 +183,38 @@ submit_and_wait() {
set +e
job_info=$( qstat "${jobid}" )
set -e
if grep -q "${jobid}" <<< "${job_info}"; then
job_running=true
# Getting the status letter from scheduler info
status=$( grep "${jobid}" <<< "${job_info}" )
status=$( awk '{print $5}' <<< "${status}" )
else
job_running=false
status='COMPLETED'
set +e
exit_status=$( qstat "${jobid}" -x -f | grep Exit_status | awk '{print $3}')
set -e
if [[ ${exit_status} != 0 ]]; then
status='FAILED'
fi
fi
;;
slurm)
job_info=$( squeue -u "${USER}" -j "${jobid}" )
job_info=$( squeue -u "${USER}" -j "${jobid}" -o '%i %T' )
if grep -q "${jobid}" <<< "${job_info}"; then
job_running=true
else
job_running=false
job_info=$( sacct -n -j "${jobid}" --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep "${JBNME}" )
fi
# Getting the status letter from scheduler info
status=$( grep "${jobid}" <<< "${job_info}" )
status=$( awk '{print $2}' <<< "${status}" )
;;
*)
;;
esac


if grep -q "${jobid}" <<< "${job_info}"; then
job_running=true
else
job_running=false
continue
fi

# Getting the status letter from scheduler info
status=$( grep "${jobid}" <<< "${job_info}" )
status=$( awk '{print $5}' <<< "${status}" )

case ${status} in
#waiting cases
#pbs: Q
Expand All @@ -217,7 +225,7 @@ submit_and_wait() {
#running cases
#pbs: R
#slurm: (old: R, new: RUNNING)
R|RUNNING)
R|RUNNING|COMPLETING)
status_label='Job running'
;;
#held cases
Expand All @@ -229,14 +237,15 @@ submit_and_wait() {
#fail/completed cases
#slurm: F/FAILED TO/TIMEOUT CA/CANCELLED
F|TO|CA|FAILED|TIMEOUT|CANCELLED)
echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!!"
echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!! status=${status}"
job_running=false #Trip the loop to end with these status flags
interrupt_job
exit 1
;;
#completed
#pbs only: C-Complete E-Exiting
C|E)
#pbs: C-Complete E-Exiting
#slurm: CD/COMPLETED
C|E|CD|COMPLETED)
status_label='Completed'
;;
*)
Expand All @@ -253,140 +262,6 @@ submit_and_wait() {
done
}

#######################################
# Check the output of one regression test against its baseline, or, when
# CREATE_BASELINE is not "false", copy the output into a new baseline.
#
# Every message is written both to stdout and to ${RT_LOG}; ${RT_LOG} is
# truncated at the start of each call.
#
# Globals (read):
#   TEST_ID, RTPWD, CNTL_DIR, RT_COMPILER, RUNDIR, RT_LOG, CREATE_BASELINE,
#   LIST_FILES, MACHINE_ID, CMP_DATAONLY, NEW_BASELINE, PATHRT,
#   ECF_TRYNO (only when ECFLOW is "true")
# Globals (written):
#   ROCOTO, ECFLOW (defaulted to "false" when unset), TRIES
# Arguments:
#   None
# Outputs:
#   Progress text on stdout and in ${RT_LOG}; "<file>_nccmp.log" files in
#   the current working directory when nccmp is used; on failure a line is
#   appended to ${PATHRT}/fail_test_${TEST_ID}.
# Returns:
#   0 if every file compared equal (or was copied), 1 otherwise.
#######################################
check_results() {
  echo "rt_utils.sh: Checking results of the regression test: ${TEST_ID}"

  ROCOTO=${ROCOTO:-false}
  ECFLOW=${ECFLOW:-false}

  local test_status='PASS'
  # Loop variable and per-file comparison status. Kept local so they neither
  # clobber nor inherit same-named variables from the caller.
  local i d

  # Give one minute for data to show up on file system
  #sleep 60

  # First write truncates the log; everything after this appends.
  {
    echo
    echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}"
    echo "working dir = ${RUNDIR}"
    echo "Checking test ${TEST_ID} results ...."
  } > "${RT_LOG}"
  echo
  echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}"
  echo "working dir = ${RUNDIR}"
  echo "Checking test ${TEST_ID} results ...."

  if [[ ${CREATE_BASELINE} = false ]]; then
    #
    # --- regression test comparison
    #
    # LIST_FILES is intentionally unquoted: it is a whitespace-separated list.
    for i in ${LIST_FILES} ; do
      # Reset the status for every file. A netCDF file on a machine that is
      # not in the nccmp list below runs no comparison at all, so without the
      # reset it would report the previous file's result (or abort under
      # 'set -u' when it is the first file). Such a file is reported as OK,
      # exactly as it was whenever no earlier status existed.
      d=0

      printf %s " Comparing ${i} ....." >> "${RT_LOG}"
      printf %s " Comparing ${i} ....."

      if [[ ! -f ${RUNDIR}/${i} ]] ; then

        echo ".......MISSING file" >> "${RT_LOG}"
        echo ".......MISSING file"
        test_status='FAIL'

      elif [[ ! -f ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i} ]] ; then

        echo ".......MISSING baseline" >> "${RT_LOG}"
        echo ".......MISSING baseline"
        test_status='FAIL'

      else
        # Files whose extension starts with "nc" are compared with nccmp;
        # everything else is compared byte-for-byte with cmp.
        if [[ ${i##*.} == nc* ]] ; then
          if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then
            printf "USING NCCMP.." >> "${RT_LOG}"
            printf "USING NCCMP.."
            # '&& d=$? || d=$?' captures the exit status without tripping 'set -e'.
            if [[ ${CMP_DATAONLY} == false ]]; then
              nccmp -d -S -q -f -g -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$?
            else
              nccmp -d -S -q -f -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$?
            fi
            # Any status other than 0 or 1 is additionally flagged as an error.
            if [[ ${d} -ne 0 && ${d} -ne 1 ]]; then
              printf "....ERROR" >> "${RT_LOG}"
              printf "....ERROR"
              test_status='FAIL'
            fi
          fi
        else
          printf "USING CMP.." >> "${RT_LOG}"
          printf "USING CMP.."
          cmp "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" >/dev/null 2>&1 && d=$? || d=$?
          # cmp status 2 is additionally flagged as an error.
          if [[ ${d} -eq 2 ]]; then
            printf "....ERROR" >> "${RT_LOG}"
            printf "....ERROR"
            test_status='FAIL'
          fi

        fi

        if [[ ${d} -ne 0 ]]; then
          echo "....NOT IDENTICAL" >> "${RT_LOG}"
          echo "....NOT IDENTICAL"
          test_status='FAIL'
        else
          echo "....OK" >> "${RT_LOG}"
          echo "....OK"
        fi

      fi

    done

  else
    #
    # --- create baselines
    #
    echo;echo "Moving baseline ${TEST_ID} files ...."
    echo;echo "Moving baseline ${TEST_ID} files ...." >> "${RT_LOG}"

    for i in ${LIST_FILES} ; do
      printf %s " Moving ${i} ....."
      printf %s " Moving ${i} ....." >> "${RT_LOG}"
      if [[ -f ${RUNDIR}/${i} ]] ; then
        # Entries may contain sub-directories; create them in the new baseline.
        mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")"
        cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}"
        echo "....OK" >> "${RT_LOG}"
        echo "....OK"
      else
        echo "....NOT OK. Missing ${RUNDIR}/${i}" >> "${RT_LOG}"
        echo "....NOT OK. Missing ${RUNDIR}/${i}"
        test_status='FAIL'
      fi
    done

  fi

  # Copy the timing and memory summary lines from the run's 'out' file.
  {
    echo
    grep "The total amount of wall time" "${RUNDIR}/out"
    grep "The maximum resident set size" "${RUNDIR}/out"
    echo
  } >> "${RT_LOG}"

  # When run under ecFlow, report the try number if this was a retry.
  TRIES=''
  if [[ ${ECFLOW} == true ]]; then
    if [[ ${ECF_TRYNO} -gt 1 ]]; then
      TRIES=" Tries: ${ECF_TRYNO}"
    fi
  fi
  echo "Test ${TEST_ID} ${test_status}${TRIES}" >> "${RT_LOG}"
  echo >> "${RT_LOG}"
  echo "Test ${TEST_ID} ${test_status}${TRIES}"
  echo

  if [[ ${test_status} = 'FAIL' ]]; then
    echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}"
    return 1
  else
    return 0
  fi
}


kill_job() {
echo "rt_utils.sh: Killing job: ${jobid} on ${SCHEDULER}..."
[[ -z $1 ]] && exit 1
Expand Down Expand Up @@ -580,14 +455,16 @@ ecflow_create_compile_task() {

cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/compile_${COMPILE_ID}.ecf"
%include <head.h>
${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log" 2>&1 &
(
cd "${LOG_DIR}"
ln -sf "compile_${COMPILE_ID}.log.\${ECF_TRYNO}" "compile_${COMPILE_ID}.log"
)
${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log.\${ECF_TRYNO}" 2>&1 &
%include <tail.h>
EOF
{
echo " task compile_${COMPILE_ID}"
echo " label build_options '${MAKE_OPT}'"
echo " label job_id ''"
echo " label job_status ''"
echo " inlimit max_builds"
} >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def"
}
Expand All @@ -596,13 +473,15 @@ ecflow_create_run_task() {
echo "rt_utils.sh: ${TEST_ID}: Creating ECFLOW run task"
cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/${TEST_ID}${RT_SUFFIX}.ecf"
%include <head.h>
${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log" 2>&1 &
(
cd "${LOG_DIR}"
ln -sf "run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log"
)
${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" 2>&1 &
%include <tail.h>
EOF
{
echo " task ${TEST_ID}${RT_SUFFIX}"
echo " label job_id ''"
echo " label job_status ''"
echo " inlimit max_jobs"
} >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def"
if [[ ${DEP_RUN} != '' ]]; then
Expand Down
11 changes: 10 additions & 1 deletion tests/run_compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,16 @@ cleanup() {

#######################################
# Record that the current compile job failed and terminate the script.
#
# Globals (read):
#   JBNME  - job name; used in the message and in the marker file name
#   PATHRT - directory in which the marker file fail_${JBNME} is written
#   ROCOTO, ECFLOW - "true" when the script was submitted by that workflow
#                    manager (both default to "false" when unset)
# Arguments:
#   None
# Outputs:
#   Appends one line to ${PATHRT}/fail_${JBNME}.
# Returns:
#   Never returns; exits the shell with status 1 under a workflow manager
#   and with status 0 otherwise.
#######################################
write_fail_test() {
  echo "${JBNME} failed in run_compile" >> "${PATHRT}/fail_${JBNME}"
  # No unconditional 'exit' may precede this branch: it would make the
  # interactive (exit 0) path unreachable.
  if [[ ${ROCOTO:-false} == true ]] || [[ ${ECFLOW:-false} == true ]]; then
    # if this script has been submitted by a workflow return non-zero exit status
    # so that workflow can resubmit it
    exit 1
  else
    # if this script has been executed interactively, return zero exit status
    # so that rt.sh can continue running, and hope that rt.sh's generate_log
    # will catch failed tests
    exit 0
  fi
}

remove_fail_test() {
Expand Down
Loading

0 comments on commit 87535af

Please sign in to comment.