From 103f2c45094e238c44931c4e7340f27c2e97cac5 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 18 Jun 2024 23:05:20 +0000 Subject: [PATCH 01/49] compiled OK now --- env/AWSPW.env | 6 +-- modulefiles/module_base.noaacloud.lua | 51 ++++++++++++++++++++ modulefiles/module_gwci.noaacloud.lua | 15 ++++++ modulefiles/module_gwsetup.noaacloud.lua | 20 ++++++++ parm/config/gfs/config.resources | 2 +- sorc/build_all.sh | 2 +- sorc/build_ufs.sh | 29 ++---------- sorc/link_workflow.sh | 39 +++++----------- sorc/ufs_model.fd | 2 +- ush/load_fv3gfs_modules.sh | 2 +- ush/module-setup.sh | 6 +-- versions/build.noaacloud.ver | 5 ++ versions/run.noaacloud.ver | 10 ++++ workflow/hosts.py | 5 +- workflow/hosts/awspw.yaml | 10 ++-- workflow/rocoto/gfs_tasks.py | 59 +++++++++++------------- workflow/setup_expt.py | 2 - 17 files changed, 162 insertions(+), 103 deletions(-) create mode 100644 modulefiles/module_base.noaacloud.lua create mode 100644 modulefiles/module_gwci.noaacloud.lua create mode 100644 modulefiles/module_gwsetup.noaacloud.lua create mode 100644 versions/build.noaacloud.ver create mode 100644 versions/run.noaacloud.ver diff --git a/env/AWSPW.env b/env/AWSPW.env index 7d81000f5c..ac949710db 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -14,8 +14,8 @@ fi step=$1 -export launcher="mpiexec.hydra" -export mpmd_opt="" +export launcher="srun --mpi=pmi2 -l" +export mpmd_opt="--distribution=block:block --hint=nomultithread --cpus-per-task=1" # Configure MPI environment export OMP_STACKSIZE=2048000 @@ -36,7 +36,7 @@ if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then (( nnodes = (${!nprocs}+${!ppn}-1)/${!ppn} )) (( ntasks = nnodes*${!ppn} )) # With ESMF threading, the model wants to use the full node - export APRUN_UFS="${launcher} -n ${ntasks}" + export APRUN_UFS="${launcher} -n ${ntasks} ${mpmd_opt}" unset nprocs ppn nnodes ntasks elif [[ "${step}" = "post" ]]; then diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua new file mode 100644 index 0000000000..fb5b283087 --- /dev/null +++ b/modulefiles/module_base.noaacloud.lua @@ -0,0 +1,51 @@ +help([[ +Load environment to run GFS on noaacloud +]]) + +local spack_mod_path=(os.getenv("spack_mod_path") or "None") +prepend_path("MODULEPATH", spack_mod_path) + +load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None"))) +load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None"))) +load(pathJoin("python", (os.getenv("python_ver") or "None"))) + +--load(pathJoin("hpss", (os.getenv("hpss_ver") or "None"))) +load(pathJoin("gempak", (os.getenv("gempak_ver") or "None"))) +load(pathJoin("ncl", (os.getenv("ncl_ver") or "None"))) +load(pathJoin("jasper", (os.getenv("jasper_ver") or "None"))) +load(pathJoin("libpng", (os.getenv("libpng_ver") or "None"))) +load(pathJoin("cdo", (os.getenv("cdo_ver") or "None"))) +--load(pathJoin("R", (os.getenv("R_ver") or "None"))) + +load(pathJoin("hdf5", (os.getenv("hdf5_ver") or "None"))) +load(pathJoin("netcdf-c", (os.getenv("netcdf_c_ver") or "None"))) +load(pathJoin("netcdf-fortran", (os.getenv("netcdf_fortran_ver") or "None"))) + +load(pathJoin("nco", (os.getenv("nco_ver") or "None"))) +load(pathJoin("prod_util", (os.getenv("prod_util_ver") or "None"))) +load(pathJoin("grib-util", (os.getenv("grib_util_ver") or "None"))) +load(pathJoin("g2tmpl", (os.getenv("g2tmpl_ver") or "None"))) +load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None"))) +load(pathJoin("crtm", (os.getenv("crtm_ver") or "None"))) +load(pathJoin("bufr", 
(os.getenv("bufr_ver") or "None"))) +load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None"))) +load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None"))) +load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None"))) +load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None"))) +load(pathJoin("py-pandas", (os.getenv("py_pandas_ver") or "None"))) +load(pathJoin("py-python-dateutil", (os.getenv("py_python_dateutil_ver") or "None"))) +--load(pathJoin("met", (os.getenv("met_ver") or "None"))) +--load(pathJoin("metplus", (os.getenv("metplus_ver") or "None"))) +load(pathJoin("py-xarray", (os.getenv("py_xarray_ver") or "None"))) + +setenv("WGRIB2","wgrib2") +setenv("UTILROOT",(os.getenv("prod_util_ROOT") or "None")) + +--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/prepobs/v" .. (os.getenv("prepobs_run_ver") or "None"), "modulefiles")) +--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/prepobs/feature-GFSv17_com_reorg_log_update/modulefiles")) +--load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None"))) + +--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/Fit2Obs/v" .. (os.getenv("fit2obs_ver") or "None"), "modulefiles")) +--load(pathJoin("fit2obs", (os.getenv("fit2obs_ver") or "None"))) + +whatis("Description: GFS run environment") diff --git a/modulefiles/module_gwci.noaacloud.lua b/modulefiles/module_gwci.noaacloud.lua new file mode 100644 index 0000000000..c3142cd60d --- /dev/null +++ b/modulefiles/module_gwci.noaacloud.lua @@ -0,0 +1,15 @@ +help([[ +Load environment to run GFS workflow setup scripts on noaacloud +]]) + +prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core") + +load(pathJoin("stack-intel", os.getenv("2021.3.0"))) +load(pathJoin("stack-intel-oneapi-mpi", os.getenv("2021.3.0"))) + +load(pathJoin("netcdf-c", os.getenv("4.9.2"))) +load(pathJoin("netcdf-fortran", os.getenv("4.6.1"))) +load(pathJoin("nccmp","1.9.0.1")) +load(pathJoin("wgrib2", "2.0.8")) + +whatis("Description: GFS run setup CI environment") diff --git a/modulefiles/module_gwsetup.noaacloud.lua b/modulefiles/module_gwsetup.noaacloud.lua new file mode 100644 index 0000000000..f3845e8d72 --- /dev/null +++ b/modulefiles/module_gwsetup.noaacloud.lua @@ -0,0 +1,20 @@ +help([[ +Load environment to run GFS workflow setup scripts on noaacloud +]]) + +load(pathJoin("rocoto")) + +prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core") + +local stack_intel_ver=os.getenv("stack_intel_ver") or "2021.3.0" +local python_ver=os.getenv("python_ver") or "3.10.3" + +load(pathJoin("stack-intel", stack_intel_ver)) +load(pathJoin("python", python_ver)) +load("py-jinja2") +load("py-pyyaml") +load("py-numpy") +local git_ver=os.getenv("git_ver") or "1.8.3.1" +load(pathJoin("git", git_ver)) + +whatis("Description: GFS run setup environment") diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index e5f741cf7e..75b03f7156 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -59,7 +59,7 @@ case ${machine} in ;; "AWSPW") export PARTITION_BATCH="compute" - npe_node_max=40 + npe_node_max=24 ;; "CONTAINER") npe_node_max=1 diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 28f52fd306..b6c4e6cc1c 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -145,7 +145,7 @@ build_opts["ww3prepost"]="${_wave_opt} ${_verbose_opt} ${_build_ufs_opt} ${_buil # 
Optional DA builds if [[ "${_build_ufsda}" == "YES" ]]; then - if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" ]]; then + if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" && "${MACHINE_ID}" != "noaacloud" ]]; then echo "NOTE: The GDAS App is not supported on ${MACHINE_ID}. Disabling build." else build_jobs["gdas"]=8 diff --git a/sorc/build_ufs.sh b/sorc/build_ufs.sh index 7e84eaebc2..44c8c7a2ad 100755 --- a/sorc/build_ufs.sh +++ b/sorc/build_ufs.sh @@ -41,30 +41,9 @@ COMPILE_NR=0 CLEAN_BEFORE=YES CLEAN_AFTER=NO -if [[ "${MACHINE_ID}" != "noaacloud" ]]; then - BUILD_JOBS=${BUILD_JOBS:-8} ./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}" - mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x - mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua - cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua -else - - if [[ "${PW_CSP:-}" == "aws" ]]; then - set +x - # TODO: This will need to be addressed further when the EPIC stacks are available/supported. - module use /contrib/spack-stack/envs/ufswm/install/modulefiles/Core - module load stack-intel - module load stack-intel-oneapi-mpi - module load ufs-weather-model-env/1.0.0 - # TODO: It is still uncertain why this is the only module that is - # missing; check the spack build as this needed to be added manually. - module load w3emc/2.9.2 # TODO: This has similar issues for the EPIC stack. - module list - set -x - fi - - export CMAKE_FLAGS="${MAKE_OPT}" - BUILD_JOBS=${BUILD_JOBS:-8} ./build.sh - mv "${cwd}/ufs_model.fd/build/ufs_model" "${cwd}/ufs_model.fd/tests/ufs_model.x" -fi +BUILD_JOBS=${BUILD_JOBS:-8} ./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}" +mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x +mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua +cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua exit 0 diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index 4973ab8d7d..580bc0ce39 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -76,6 +76,7 @@ case "${machine}" in "jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;; "s4") FIX_DIR="/data/prod/glopara/fix" ;; "gaea") FIX_DIR="/gpfs/f5/epic/proj-shared/global/glopara/data/fix" ;; + "noaacloud") FIX_DIR="/contrib/Wei.Huang/data/hack-orion/fix" ;; *) echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR" exit 1 @@ -84,24 +85,20 @@ esac # Source fix version file source "${HOMEgfs}/versions/fix.ver" +# global-nest uses different versions of orog and ugwd +if [[ "${LINK_NEST:-OFF}" == "ON" ]] ; then + source "${HOMEgfs}/versions/fix.nest.ver" +fi # Link python pacakges in ush/python # TODO: This will be unnecessary when these are part of the virtualenv -packages=("wxflow") +packages=("wxflow" "jcb") for package in "${packages[@]}"; do cd "${HOMEgfs}/ush/python" || exit 1 [[ -s "${package}" ]] && rm -f "${package}" ${LINK} "${HOMEgfs}/sorc/${package}/src/${package}" . done -# Link GDASapp python packages in ush/python -packages=("jcb") -for package in "${packages[@]}"; do - cd "${HOMEgfs}/ush/python" || exit 1 - [[ -s "${package}" ]] && rm -f "${package}" - ${LINK} "${HOMEgfs}/sorc/gdas.cd/sorc/${package}/src/${package}" . 
-done - # Link wxflow in workflow and ci/scripts # TODO: This will be unnecessary when wxflow is part of the virtualenv cd "${HOMEgfs}/workflow" || exit 1 @@ -138,20 +135,7 @@ do fix_ver="${dir}_ver" ${LINK_OR_COPY} "${FIX_DIR}/${dir}/${!fix_ver}" "${dir}" done -# global-nest uses different versions of orog and ugwd -if [[ "${LINK_NEST:-OFF}" == "ON" ]] ; then - for dir in orog \ - ugwd - do - nestdir=${dir}_nest - if [[ -d "${nestdir}" ]]; then - [[ "${RUN_ENVIR}" == "nco" ]] && chmod -R 755 "${nestdir}" - rm -rf "${nestdir}" - fi - fix_ver="${dir}_nest_ver" - ${LINK_OR_COPY} "${FIX_DIR}/${dir}/${!fix_ver}" "${nestdir}" - done -fi + #--------------------------------------- #--add files from external repositories @@ -387,10 +371,11 @@ fi #--link source code directories #------------------------------ cd "${HOMEgfs}/sorc" || exit 8 -if [[ -d ufs_model.fd ]]; then - [[ -d upp.fd ]] && rm -rf upp.fd - ${LINK} ufs_model.fd/FV3/upp upp.fd -fi +# TODO: Commenting out until UPP is up-to-date with Rocky-8. +#if [[ -d ufs_model.fd ]]; then +# [[ -d upp.fd ]] && rm -rf upp.fd +# ${LINK} ufs_model.fd/FV3/upp upp.fd +#fi if [[ -d gsi_enkf.fd ]]; then [[ -d gsi.fd ]] && rm -rf gsi.fd diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index 485ccdfc4a..a183a52151 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit 485ccdfc4a7ed6deeb02d82c2cebe51b37e892f5 +Subproject commit a183a521516110cc9bcb86d853bd9b0dccef5bc7 diff --git a/ush/load_fv3gfs_modules.sh b/ush/load_fv3gfs_modules.sh index ae0e381db4..2cafc4fd81 100755 --- a/ush/load_fv3gfs_modules.sh +++ b/ush/load_fv3gfs_modules.sh @@ -20,7 +20,7 @@ source "${HOMEgfs}/versions/run.ver" module use "${HOMEgfs}/modulefiles" case "${MACHINE_ID}" in - "wcoss2" | "hera" | "orion" | "hercules" | "gaea" | "jet" | "s4") + "wcoss2" | "hera" | "orion" | "hercules" | "gaea" | "jet" | "s4" | "noaacloud") module load "module_base.${MACHINE_ID}" ;; *) diff --git a/ush/module-setup.sh b/ush/module-setup.sh index b4ec3edafa..398562652d 100755 --- a/ush/module-setup.sh +++ b/ush/module-setup.sh @@ -92,10 +92,8 @@ elif [[ ${MACHINE_ID} = discover* ]]; then # TODO: This can likely be made more general once other cloud # platforms come online. elif [[ ${MACHINE_ID} = "noaacloud" ]]; then - - export SPACK_ROOT=/contrib/global-workflow/spack-stack/spack - export PATH=${PATH}:${SPACK_ROOT}/bin - . 
"${SPACK_ROOT}"/share/spack/setup-env.sh + # We are on NOAA Cloud + module purge else echo WARNING: UNKNOWN PLATFORM 1>&2 diff --git a/versions/build.noaacloud.ver b/versions/build.noaacloud.ver new file mode 100644 index 0000000000..ba47313675 --- /dev/null +++ b/versions/build.noaacloud.ver @@ -0,0 +1,5 @@ +export stack_intel_ver=2021.3.0 +export stack_impi_ver=2021.3.0 +export spack_env=gsi-addon-env +source "${HOMEgfs:-}/versions/build.spack.ver" +export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" diff --git a/versions/run.noaacloud.ver b/versions/run.noaacloud.ver new file mode 100644 index 0000000000..c85d29a71a --- /dev/null +++ b/versions/run.noaacloud.ver @@ -0,0 +1,10 @@ +export stack_intel_ver=2021.3.0 +export stack_impi_ver=2021.3.0 +export spack_env=gsi-addon-env + +export gempak_ver=7.4.2 + +source "${HOMEgfs:-}/versions/run.spack.ver" +export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" + +export cdo_ver=2.2.0 diff --git a/workflow/hosts.py b/workflow/hosts.py index 2334a3ac35..5f4f51fd7f 100644 --- a/workflow/hosts.py +++ b/workflow/hosts.py @@ -15,7 +15,8 @@ class Host: """ SUPPORTED_HOSTS = ['HERA', 'ORION', 'JET', 'HERCULES', - 'WCOSS2', 'S4', 'CONTAINER', 'AWSPW', 'GAEA'] + 'WCOSS2', 'S4', 'CONTAINER', 'GAEA', + 'AWSPW'] def __init__(self, host=None): @@ -54,7 +55,7 @@ def detect(cls): elif container is not None: machine = 'CONTAINER' elif pw_csp is not None: - if pw_csp.lower() not in ['azure', 'aws', 'gcp']: + if pw_csp.lower() not in ['azure', 'aws', 'google']: raise ValueError( f'NOAA cloud service provider "{pw_csp}" is not supported.') machine = f"{pw_csp.upper()}PW" diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index d2223e799e..eb16344b28 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -3,11 +3,11 @@ DMPDIR: '/scratch1/NCEPDEV/global/glopara/dump' # TODO: This does not yet exist. PACKAGEROOT: '/scratch1/NCEPDEV/global/glopara/nwpara' #TODO: This does not yet exist. COMINsyn: '/scratch1/NCEPDEV/global/glopara/com/gfs/prod/syndat' #TODO: This does not yet exist. HOMEDIR: '/contrib/${USER}' -STMP: '/lustre/${USER}/stmp2/' -PTMP: '/lustre/${USER}/stmp4/' +STMP: '/lustre/${USER}/stmp/' +PTMP: '/lustre/${USER}/ptmp/' NOSCRUB: ${HOMEDIR} -ACCOUNT: hwufscpldcld -ACCOUNT_SERVICE: hwufscpldcld +ACCOUNT: ${USER} +ACCOUNT_SERVICE: ${USER} SCHEDULER: slurm QUEUE: batch QUEUE_SERVICE: batch @@ -16,7 +16,7 @@ PARTITION_SERVICE: compute RESERVATION: '' CHGRP_RSTPROD: 'YES' CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. -HPSSARCH: 'YES' +HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. LOCALARCH: 'NO' ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. 
diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 2c74d0f854..95f27c1ba7 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -3,6 +3,7 @@ from wxflow import timedelta_to_HMS import rocoto.rocoto as rocoto import numpy as np +import os class GFSTasks(Tasks): @@ -24,7 +25,32 @@ def stage_ic(self): # Atm ICs if self.app_config.do_atm: - prefix = f"{cpl_ic['BASE_CPLIC']}/{cpl_ic['CPL_ATMIC']}/@Y@m@d@H/atmos" + pslot = self._base['PSLOT'] + if ( 'BASE_CPLIC' in cpl_ic.keys() ): + base_cplic = f"{cpl_ic['BASE_CPLIC']}" + else: + base_cplic = os.environ.get('BASE_CPLIC') + if ( 'CPL_ATMIC' in cpl_ic.keys() ): + cpl_atmic = f"{cpl_ic['CPL_ATMIC']}" + else: + cpl_atmic = os.environ.get('CPL_ATMIC') + + prefix = f"{base_cplic}/{cpl_atmic}/@Y@m@d@H/atmos" + + pw_csp = os.environ.get('PW_CSP') + use_ufs_utils_format = os.environ.get('USE_UFS_UTILS_FORMAT', False) + if ( pw_csp in ['aws', 'azure', 'google'] or use_ufs_utils_format): + icdir = f"{base_cplic}/{cpl_atmic}" + + if('IC_PREFIX' in cpl_ic.keys()): + cpl_ic_prefix = f"{icdir}/{cpl_ic['IC_PREFIX']}" + else: + cpl_ic_prefix = 'gfs' + if('IC_TYPE' in cpl_ic.keys()): + cpl_ic_type = f"{cpl_ic['IC_TYPE']}" + else: + cpl_ic_type = 'input' + prefix = f"{icdir}/{cpl_ic_prefix}.@Y@m@d/@H/model_data/atmos/{cpl_ic_type}" for file in ['gfs_ctrl.nc'] + \ [f'{datatype}_data.tile{tile}.nc' for datatype in ['gfs', 'sfc'] @@ -483,35 +509,10 @@ def atmanlfinal(self): return task - def prepobsaero(self): - deps = [] - dep_dict = {'type': 'task', 'name': f'{self.cdump}prep'} - deps.append(rocoto.add_dependency(dep_dict)) - dependencies = rocoto.create_dependency(dep_condition='and', dep=deps) - - resources = self.get_resource('prepobsaero') - task_name = f'{self.cdump}prepobsaero' - task_dict = {'task_name': task_name, - 'resources': resources, - 'dependency': dependencies, - 'envars': self.envars, - 'cycledef': self.cdump.replace('enkf', ''), - 'command': f'{self.HOMEgfs}/jobs/rocoto/prepobsaero.sh', - 'job_name': f'{self.pslot}_{task_name}_@H', - 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', - 'maxtries': '&MAXTRIES;' - } - - task = rocoto.create_task(task_dict) - - return task - def aeroanlinit(self): deps = [] dep_dict = {'type': 'task', 'name': f'{self.cdump}prep'} - if self.app_config.do_prep_obs_aero: - dep_dict = {'type': 'task', 'name': f'{self.cdump}prepobsaero'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep_condition='and', dep=deps) @@ -1200,13 +1201,9 @@ def wavepostbndpnt(self): return task def wavepostbndpntbll(self): - - # The wavepostbndpntbll job runs on forecast hours up to FHMAX_WAV_IBP - last_fhr = self._configs['wave']['FHMAX_WAV_IBP'] - deps = [] atmos_hist_path = self._template_to_rocoto_cycstring(self._base["COM_ATMOS_HISTORY_TMPL"]) - data = f'{atmos_hist_path}/{self.cdump}.t@Hz.atm.logf{last_fhr:03d}.txt' + data = f'{atmos_hist_path}/{self.cdump}.t@Hz.atm.logf180.txt' dep_dict = {'type': 'data', 'data': data} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps) diff --git a/workflow/setup_expt.py b/workflow/setup_expt.py index b44842b982..97d25dc15a 100755 --- a/workflow/setup_expt.py +++ b/workflow/setup_expt.py @@ -287,8 +287,6 @@ def _update_defaults(dict_in: dict) -> dict: data = AttrDict(host.info, **inputs.__dict__) data.HOMEgfs = _top yaml_path = inputs.yaml - if not os.path.exists(yaml_path): - raise IOError(f'YAML file does not exist, check path:' + yaml_path) yaml_dict = 
_update_defaults(AttrDict(parse_j2yaml(yaml_path, data))) # First update config.base From b0ac40679b1e445c75a565e3c4264a9a4102fb00 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 19 Jun 2024 16:58:27 +0000 Subject: [PATCH 02/49] re-test on aws with fewer changes --- parm/config/gfs/config.resources | 2 +- sorc/link_workflow.sh | 38 ++++++++++++++------ versions/run.noaacloud.ver | 1 + workflow/hosts.py | 3 +- workflow/hosts/awspw.yaml | 1 + workflow/rocoto/gefs_tasks.py | 4 ++- workflow/rocoto/gfs_tasks.py | 61 +++++++++++++++++--------------- workflow/rocoto/workflow_xml.py | 21 ++++++++--- workflow/setup_expt.py | 2 ++ 9 files changed, 85 insertions(+), 48 deletions(-) diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index 75b03f7156..f006b8e7f0 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -59,7 +59,7 @@ case ${machine} in ;; "AWSPW") export PARTITION_BATCH="compute" - npe_node_max=24 + npe_node_max=36 ;; "CONTAINER") npe_node_max=1 diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index 580bc0ce39..48eb626633 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -85,20 +85,24 @@ esac # Source fix version file source "${HOMEgfs}/versions/fix.ver" -# global-nest uses different versions of orog and ugwd -if [[ "${LINK_NEST:-OFF}" == "ON" ]] ; then - source "${HOMEgfs}/versions/fix.nest.ver" -fi # Link python pacakges in ush/python # TODO: This will be unnecessary when these are part of the virtualenv -packages=("wxflow" "jcb") +packages=("wxflow") for package in "${packages[@]}"; do cd "${HOMEgfs}/ush/python" || exit 1 [[ -s "${package}" ]] && rm -f "${package}" ${LINK} "${HOMEgfs}/sorc/${package}/src/${package}" . done +# Link GDASapp python packages in ush/python +packages=("jcb") +for package in "${packages[@]}"; do + cd "${HOMEgfs}/ush/python" || exit 1 + [[ -s "${package}" ]] && rm -f "${package}" + ${LINK} "${HOMEgfs}/sorc/gdas.cd/sorc/${package}/src/${package}" . +done + # Link wxflow in workflow and ci/scripts # TODO: This will be unnecessary when wxflow is part of the virtualenv cd "${HOMEgfs}/workflow" || exit 1 @@ -135,7 +139,20 @@ do fix_ver="${dir}_ver" ${LINK_OR_COPY} "${FIX_DIR}/${dir}/${!fix_ver}" "${dir}" done - +# global-nest uses different versions of orog and ugwd +if [[ "${LINK_NEST:-OFF}" == "ON" ]] ; then + for dir in orog \ + ugwd + do + nestdir=${dir}_nest + if [[ -d "${nestdir}" ]]; then + [[ "${RUN_ENVIR}" == "nco" ]] && chmod -R 755 "${nestdir}" + rm -rf "${nestdir}" + fi + fix_ver="${dir}_nest_ver" + ${LINK_OR_COPY} "${FIX_DIR}/${dir}/${!fix_ver}" "${nestdir}" + done +fi #--------------------------------------- #--add files from external repositories @@ -371,11 +388,10 @@ fi #--link source code directories #------------------------------ cd "${HOMEgfs}/sorc" || exit 8 -# TODO: Commenting out until UPP is up-to-date with Rocky-8. 
-#if [[ -d ufs_model.fd ]]; then -# [[ -d upp.fd ]] && rm -rf upp.fd -# ${LINK} ufs_model.fd/FV3/upp upp.fd -#fi +if [[ -d ufs_model.fd ]]; then + [[ -d upp.fd ]] && rm -rf upp.fd + ${LINK} ufs_model.fd/FV3/upp upp.fd +fi if [[ -d gsi_enkf.fd ]]; then [[ -d gsi.fd ]] && rm -rf gsi.fd diff --git a/versions/run.noaacloud.ver b/versions/run.noaacloud.ver index c85d29a71a..7c23da0e7a 100644 --- a/versions/run.noaacloud.ver +++ b/versions/run.noaacloud.ver @@ -7,4 +7,5 @@ export gempak_ver=7.4.2 source "${HOMEgfs:-}/versions/run.spack.ver" export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" +export ncl_ver=6.6.2 export cdo_ver=2.2.0 diff --git a/workflow/hosts.py b/workflow/hosts.py index 5f4f51fd7f..6488bf12ea 100644 --- a/workflow/hosts.py +++ b/workflow/hosts.py @@ -15,8 +15,7 @@ class Host: """ SUPPORTED_HOSTS = ['HERA', 'ORION', 'JET', 'HERCULES', - 'WCOSS2', 'S4', 'CONTAINER', 'GAEA', - 'AWSPW'] + 'WCOSS2', 'S4', 'CONTAINER', 'AWSPW', 'GAEA'] def __init__(self, host=None): diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index eb16344b28..80f16fbc5b 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -18,6 +18,7 @@ CHGRP_RSTPROD: 'YES' CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. +BASE_CPLIC: '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs' LOCALARCH: 'NO' ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. MAKE_NSSTBUFR: 'NO' diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index 6fffc881e0..8fadafe9b0 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -2,7 +2,7 @@ from rocoto.tasks import Tasks import rocoto.rocoto as rocoto from datetime import datetime, timedelta - +import os class GEFSTasks(Tasks): @@ -11,6 +11,8 @@ def __init__(self, app_config: AppConfig, cdump: str) -> None: def stage_ic(self): cpl_ic = self._configs['stage_ic'] + if ('BASE_CPLIC' not in cpl_ic.keys()): + cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs') deps = [] dtg_prefix = "@Y@m@d.@H0000" offset = str(self._configs['base']['OFFSET_START_HOUR']).zfill(2) + ":00:00" diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 95f27c1ba7..7c3ffb0cc9 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -5,7 +5,6 @@ import numpy as np import os - class GFSTasks(Tasks): def __init__(self, app_config: AppConfig, cdump: str) -> None: @@ -25,32 +24,9 @@ def stage_ic(self): # Atm ICs if self.app_config.do_atm: - pslot = self._base['PSLOT'] - if ( 'BASE_CPLIC' in cpl_ic.keys() ): - base_cplic = f"{cpl_ic['BASE_CPLIC']}" - else: - base_cplic = os.environ.get('BASE_CPLIC') - if ( 'CPL_ATMIC' in cpl_ic.keys() ): - cpl_atmic = f"{cpl_ic['CPL_ATMIC']}" - else: - cpl_atmic = os.environ.get('CPL_ATMIC') - - prefix = f"{base_cplic}/{cpl_atmic}/@Y@m@d@H/atmos" - - pw_csp = os.environ.get('PW_CSP') - use_ufs_utils_format = os.environ.get('USE_UFS_UTILS_FORMAT', False) - if ( pw_csp in ['aws', 'azure', 'google'] or use_ufs_utils_format): - icdir = f"{base_cplic}/{cpl_atmic}" - - if('IC_PREFIX' in cpl_ic.keys()): - cpl_ic_prefix = f"{icdir}/{cpl_ic['IC_PREFIX']}" - else: - cpl_ic_prefix = 'gfs' - if('IC_TYPE' in cpl_ic.keys()): - cpl_ic_type = f"{cpl_ic['IC_TYPE']}" - else: - cpl_ic_type = 'input' - prefix = 
f"{icdir}/{cpl_ic_prefix}.@Y@m@d/@H/model_data/atmos/{cpl_ic_type}" + if ('BASE_CPLIC' not in cpl_ic.keys()): + cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs') + prefix = f"{cpl_ic['BASE_CPLIC']}/{cpl_ic['CPL_ATMIC']}/@Y@m@d@H/atmos" for file in ['gfs_ctrl.nc'] + \ [f'{datatype}_data.tile{tile}.nc' for datatype in ['gfs', 'sfc'] @@ -509,10 +485,35 @@ def atmanlfinal(self): return task + def prepobsaero(self): + deps = [] + dep_dict = {'type': 'task', 'name': f'{self.cdump}prep'} + deps.append(rocoto.add_dependency(dep_dict)) + dependencies = rocoto.create_dependency(dep_condition='and', dep=deps) + + resources = self.get_resource('prepobsaero') + task_name = f'{self.cdump}prepobsaero' + task_dict = {'task_name': task_name, + 'resources': resources, + 'dependency': dependencies, + 'envars': self.envars, + 'cycledef': self.cdump.replace('enkf', ''), + 'command': f'{self.HOMEgfs}/jobs/rocoto/prepobsaero.sh', + 'job_name': f'{self.pslot}_{task_name}_@H', + 'log': f'{self.rotdir}/logs/@Y@m@d@H/{task_name}.log', + 'maxtries': '&MAXTRIES;' + } + + task = rocoto.create_task(task_dict) + + return task + def aeroanlinit(self): deps = [] dep_dict = {'type': 'task', 'name': f'{self.cdump}prep'} + if self.app_config.do_prep_obs_aero: + dep_dict = {'type': 'task', 'name': f'{self.cdump}prepobsaero'} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep_condition='and', dep=deps) @@ -1201,9 +1202,13 @@ def wavepostbndpnt(self): return task def wavepostbndpntbll(self): + + # The wavepostbndpntbll job runs on forecast hours up to FHMAX_WAV_IBP + last_fhr = self._configs['wave']['FHMAX_WAV_IBP'] + deps = [] atmos_hist_path = self._template_to_rocoto_cycstring(self._base["COM_ATMOS_HISTORY_TMPL"]) - data = f'{atmos_hist_path}/{self.cdump}.t@Hz.atm.logf180.txt' + data = f'{atmos_hist_path}/{self.cdump}.t@Hz.atm.logf{last_fhr:03d}.txt' dep_dict = {'type': 'data', 'data': data} deps.append(rocoto.add_dependency(dep_dict)) dependencies = rocoto.create_dependency(dep=deps) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index 11b2cdfc45..c586fba0f5 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -8,6 +8,7 @@ from applications.applications import AppConfig from rocoto.workflow_tasks import get_wf_tasks import rocoto.rocoto as rocoto +import numpy as np from abc import ABC, abstractmethod @@ -156,11 +157,21 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: replyto = '' strings = ['', - f'#################### {pslot} ####################', - f'MAILTO="{replyto}"', - f'{cronintstr} {rocotorunstr}', - '#################################################################', - ''] + f'#################### {pslot} ####################', + f'MAILTO="{replyto}"' + ] + pw_csp = os.environ.get('PW_CSP') + if ( pw_csp in ['aws', 'azure', 'google'] ): + strings = np.append(strings, + [ + f'SHELL="/bin/bash"', + f'BASH_ENV="/etc/bashrc"' + ]) + strings = np.append(strings, + [ + f'{cronintstr} {rocotorunstr}', + '#################################################################', + '']) if crontab_file is None: crontab_file = f"{expdir}/{pslot}.crontab" diff --git a/workflow/setup_expt.py b/workflow/setup_expt.py index 97d25dc15a..b44842b982 100755 --- a/workflow/setup_expt.py +++ b/workflow/setup_expt.py @@ -287,6 +287,8 @@ def _update_defaults(dict_in: dict) -> dict: data = AttrDict(host.info, **inputs.__dict__) data.HOMEgfs = _top yaml_path = 
inputs.yaml + if not os.path.exists(yaml_path): + raise IOError(f'YAML file does not exist, check path:' + yaml_path) yaml_dict = _update_defaults(AttrDict(parse_j2yaml(yaml_path, data))) # First update config.base From 3de972f528462baeebf0896f196d603c566ad722 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 21 Jun 2024 16:23:45 +0000 Subject: [PATCH 03/49] make change in tasks.py to avoid error finding libiomp5.so problem --- parm/config/gfs/config.resources | 2 +- workflow/rocoto/tasks.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index f006b8e7f0..d82ca836a5 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -812,7 +812,7 @@ case ${step} in ;; "atmos_products") - export wtime_atmos_products="00:15:00" + export wtime_atmos_products="00:45:00" export npe_atmos_products=24 export nth_atmos_products=1 export npe_node_atmos_products="${npe_atmos_products}" diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index ad135be713..abf40d48b5 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import os import numpy as np from applications.applications import AppConfig import rocoto.rocoto as rocoto @@ -214,7 +215,11 @@ def get_resource(self, task_name): else: native += ':shared' elif scheduler in ['slurm']: - native = '--export=NONE' + pw_csp = os.environ.get('PW_CSP', 'unknown') + if ( pw_csp in ['aws', 'azure', 'google'] ): + native = '--export=ALL --exclusive' + else: + native = '--export=NONE' if task_config['RESERVATION'] != "": native += '' if task_name in Tasks.SERVICE_TASKS else ' --reservation=' + task_config['RESERVATION'] From bc4c4a830640da698612c444816ab4aae5d6b892 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Sat, 22 Jun 2024 21:05:03 +0000 Subject: [PATCH 04/49] add comments so the reviewers know that these changes are for AWS, and hope can provide a better way --- ush/forecast_postdet.sh | 26 +++++++++++++++----------- workflow/rocoto/gefs_tasks.py | 2 ++ workflow/rocoto/gfs_tasks.py | 2 ++ workflow/rocoto/tasks.py | 4 ++++ 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/ush/forecast_postdet.sh b/ush/forecast_postdet.sh index 2cc34eaacd..253138ce01 100755 --- a/ush/forecast_postdet.sh +++ b/ush/forecast_postdet.sh @@ -268,20 +268,24 @@ FV3_out() { fi fi - # Get list of FV3 restart files - local file_list fv3_file - file_list=$(FV3_restarts) + ### Check that there are restart files to copy + #if [[ ${#restart_dates} -gt 0 ]]; then + if [[ -n ${restart_dates} ]]; then + # Get list of FV3 restart files + local file_list fv3_file + file_list=$(FV3_restarts) - # Copy restarts for the dates collected above to COM - for restart_date in "${restart_dates[@]}"; do - echo "Copying FV3 restarts for 'RUN=${RUN}' at ${restart_date}" - for fv3_file in ${file_list}; do - ${NCP} "${DATArestart}/FV3_RESTART/${restart_date}.${fv3_file}" \ - "${COMOUT_ATMOS_RESTART}/${restart_date}.${fv3_file}" + # Copy restarts for the dates collected above to COM + for restart_date in "${restart_dates[@]}"; do + echo "Copying FV3 restarts for 'RUN=${RUN}' at ${restart_date}" + for fv3_file in ${file_list}; do + ${NCP} "${DATArestart}/FV3_RESTART/${restart_date}.${fv3_file}" \ + "${COMOUT_ATMOS_RESTART}/${restart_date}.${fv3_file}" + done done - done - echo "SUB ${FUNCNAME[0]}: Output data for FV3 copied" + echo "SUB ${FUNCNAME[0]}: Output data for FV3 copied" + fi } # Disable variable not used 
warnings diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index 8fadafe9b0..26ba33274b 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -11,6 +11,8 @@ def __init__(self, app_config: AppConfig, cdump: str) -> None: def stage_ic(self): cpl_ic = self._configs['stage_ic'] + #The if block below is added for AWS. + #If we have a proper way to define 'BASE_CPLIC', this if block can be removed. if ('BASE_CPLIC' not in cpl_ic.keys()): cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs') deps = [] diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 7c3ffb0cc9..c2bddc4544 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -24,6 +24,8 @@ def stage_ic(self): # Atm ICs if self.app_config.do_atm: + #The if block below is added for AWS. + #If we have a proper way to define 'BASE_CPLIC', this if block can be removed. if ('BASE_CPLIC' not in cpl_ic.keys()): cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs') prefix = f"{cpl_ic['BASE_CPLIC']}/{cpl_ic['CPL_ATMIC']}/@Y@m@d@H/atmos" diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index abf40d48b5..104743cfe4 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -215,6 +215,10 @@ def get_resource(self, task_name): else: native += ':shared' elif scheduler in ['slurm']: + #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, we need 'native' defined + #as below. Or, it won't run, but with an error: + #"ufs_model.x: error while loading shared libraries: libiomp5.so: cannot open shared object file: No such file or directory" + #Even the library path is clearly in LD_LIBRARY_PATH, or load exactly the modules when build ufs_model.x pw_csp = os.environ.get('PW_CSP', 'unknown') if ( pw_csp in ['aws', 'azure', 'google'] ): native = '--export=ALL --exclusive' From b7249374aba29ade6587686925f161836ac5c436 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Sat, 22 Jun 2024 21:10:14 +0000 Subject: [PATCH 05/49] add comments so the reviewers know that these changes are for AWS, and hope can provide a better way --- workflow/rocoto/workflow_xml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index c586fba0f5..ead451afb1 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -160,6 +160,7 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: f'#################### {pslot} ####################', f'MAILTO="{replyto}"' ] + #AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. 
pw_csp = os.environ.get('PW_CSP') if ( pw_csp in ['aws', 'azure', 'google'] ): strings = np.append(strings, From 12ab29fe6a783d12619ab6f5b9f6d429c160e743 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 25 Jun 2024 20:56:25 +0000 Subject: [PATCH 06/49] reverse config.resource changes, and memory restriction on AWS --- parm/config/gfs/config.resources | 2 +- ush/forecast_postdet.sh | 6 ++++-- workflow/rocoto/tasks.py | 14 ++++++++++---- workflow/rocoto/workflow_xml.py | 5 ++--- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index d82ca836a5..f006b8e7f0 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -812,7 +812,7 @@ case ${step} in ;; "atmos_products") - export wtime_atmos_products="00:45:00" + export wtime_atmos_products="00:15:00" export npe_atmos_products=24 export nth_atmos_products=1 export npe_node_atmos_products="${npe_atmos_products}" diff --git a/ush/forecast_postdet.sh b/ush/forecast_postdet.sh index 253138ce01..0d43f02a57 100755 --- a/ush/forecast_postdet.sh +++ b/ush/forecast_postdet.sh @@ -255,22 +255,24 @@ FV3_out() { # Determine the dates for restart files to be copied to COM local restart_date restart_dates restart_dates=() + number_of_restart_dates=0 # Copy restarts in the assimilation window for RUN=gdas|enkfgdas|enkfgfs if [[ "${RUN}" =~ "gdas" || "${RUN}" == "enkfgfs" ]]; then restart_date="${model_start_date_next_cycle}" while (( restart_date <= forecast_end_cycle )); do restart_dates+=("${restart_date:0:8}.${restart_date:8:2}0000") + number_of_restart_dates=$((number_of_restart_dates + 1)) restart_date=$(date --utc -d "${restart_date:0:8} ${restart_date:8:2} + ${restart_interval} hours" +%Y%m%d%H) done elif [[ "${RUN}" == "gfs" || "${RUN}" == "gefs" ]]; then # Copy restarts at the end of the forecast segment for RUN=gfs|gefs if [[ "${COPY_FINAL_RESTARTS}" == "YES" ]]; then restart_dates+=("${forecast_end_cycle:0:8}.${forecast_end_cycle:8:2}0000") + number_of_restart_dates=$((number_of_restart_dates + 1)) fi fi ### Check that there are restart files to copy - #if [[ ${#restart_dates} -gt 0 ]]; then - if [[ -n ${restart_dates} ]]; then + if [[ ${number_of_restart_dates} -gt 0 ]]; then # Get list of FV3 restart files local file_list fv3_file file_list=$(FV3_restarts) diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 104743cfe4..fab753bd07 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -197,10 +197,16 @@ def get_resource(self, task_name): if self.cdump in ['gfs'] and f'nth_{task_name}_gfs' in task_config.keys(): threads = task_config[f'nth_{task_name}_gfs'] - memory = task_config.get(f'memory_{task_name}', None) - if scheduler in ['pbspro']: - if task_config.get('prepost', False): - memory += ':prepost=true' + #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, cannot define 'memory' here, + #Or the arch and cleanup will hang. 
+ pw_csp = os.environ.get('PW_CSP', 'unknown') + if ( pw_csp in ['aws', 'azure', 'google'] ): + memory = None + else: + memory = task_config.get(f'memory_{task_name}', None) + if scheduler in ['pbspro']: + if task_config.get('prepost', False): + memory += ':prepost=true' native = None if scheduler in ['pbspro']: diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index ead451afb1..1b0c7431fc 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -8,7 +8,6 @@ from applications.applications import AppConfig from rocoto.workflow_tasks import get_wf_tasks import rocoto.rocoto as rocoto -import numpy as np from abc import ABC, abstractmethod @@ -163,12 +162,12 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: #AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. pw_csp = os.environ.get('PW_CSP') if ( pw_csp in ['aws', 'azure', 'google'] ): - strings = np.append(strings, + strings.extend( [ f'SHELL="/bin/bash"', f'BASH_ENV="/etc/bashrc"' ]) - strings = np.append(strings, + strings.extend( [ f'{cronintstr} {rocotorunstr}', '#################################################################', From 2290ea28637152fdd89cb8ccc5866a531e308a35 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 26 Jun 2024 14:13:23 +0000 Subject: [PATCH 07/49] move common data to a shared place --- sorc/link_workflow.sh | 2 +- workflow/hosts/awspw.yaml | 3 ++- workflow/rocoto/workflow_xml.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index 48eb626633..e6ea511c66 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -76,7 +76,7 @@ case "${machine}" in "jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;; "s4") FIX_DIR="/data/prod/glopara/fix" ;; "gaea") FIX_DIR="/gpfs/f5/epic/proj-shared/global/glopara/data/fix" ;; - "noaacloud") FIX_DIR="/contrib/Wei.Huang/data/hack-orion/fix" ;; + "noaacloud") FIX_DIR="/contrib/global-workflow-shared-data/fix" ;; *) echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR" exit 1 diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index 80f16fbc5b..d651926a2c 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -18,7 +18,8 @@ CHGRP_RSTPROD: 'YES' CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. -BASE_CPLIC: '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs' +#BASE_CPLIC: '/contrib/global-workflow-shared-data/ICDIRS/prototype_ICs' +BASE_CPLIC: '/s3bucket/global-workflow-shared-data/ICDIRS/prototype_ICs' LOCALARCH: 'NO' ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. MAKE_NSSTBUFR: 'NO' diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index 1b0c7431fc..29671fc5ed 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -160,7 +160,7 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: f'MAILTO="{replyto}"' ] #AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. 
- pw_csp = os.environ.get('PW_CSP') + pw_csp = os.environ.get('PW_CSP', None) if ( pw_csp in ['aws', 'azure', 'google'] ): strings.extend( [ From cd2c8e71d3b533007d4b678cd783cbe20c31451c Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 26 Jun 2024 15:51:05 +0000 Subject: [PATCH 08/49] use ICs from s3-bucket --- workflow/hosts/awspw.yaml | 4 ++-- workflow/rocoto/tasks.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index d651926a2c..2b065a2d61 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -18,8 +18,8 @@ CHGRP_RSTPROD: 'YES' CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. -#BASE_CPLIC: '/contrib/global-workflow-shared-data/ICDIRS/prototype_ICs' -BASE_CPLIC: '/s3bucket/global-workflow-shared-data/ICDIRS/prototype_ICs' +#BASE_CPLIC: '/contrib/global-workflow-shared-data/ICSDIR/prototype_ICs' +BASE_CPLIC: '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs' LOCALARCH: 'NO' ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. MAKE_NSSTBUFR: 'NO' diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 5c5792f26c..dbd570eb7d 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -177,6 +177,13 @@ def get_resource(self, task_name): task_config = self._configs[task_name] + #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, + #use "$USER" as account. + pw_csp = os.environ.get('PW_CSP', None) + if ( pw_csp in ['aws', 'azure', 'google'] ): + task_config['ACCOUNT'] = os.environ.get('USER') + task_config['ACCOUNT_SERVICE'] = os.environ.get('USER') + account = task_config['ACCOUNT_SERVICE'] if task_name in Tasks.SERVICE_TASKS else task_config['ACCOUNT'] if f'wtime_{task_name}_{self.cdump}' in task_config: @@ -209,7 +216,6 @@ def get_resource(self, task_name): #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, cannot define 'memory' here, #Or the arch and cleanup will hang. - pw_csp = os.environ.get('PW_CSP', 'unknown') if ( pw_csp in ['aws', 'azure', 'google'] ): memory = None else: From 46e3ef543c95d1cf0e717fb60f5d2845dbb90c1c Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 2 Jul 2024 13:44:08 +0000 Subject: [PATCH 09/49] change as suggested by reviewer --- workflow/rocoto/workflow_xml.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index 29671fc5ed..8c859db25e 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -160,8 +160,7 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: f'MAILTO="{replyto}"' ] #AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. 
- pw_csp = os.environ.get('PW_CSP', None) - if ( pw_csp in ['aws', 'azure', 'google'] ): + if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']: strings.extend( [ f'SHELL="/bin/bash"', From a34a4c85e269d2cf5469fba44b702fcd2e6812b5 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 4 Jul 2024 21:22:52 +0000 Subject: [PATCH 10/49] sync sorc/ufs_model.fd --- sorc/ufs_model.fd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index a183a52151..e784814dfc 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit a183a521516110cc9bcb86d853bd9b0dccef5bc7 +Subproject commit e784814dfce3fb01e82be6d3949f9811860041d7 From 44011a39701a6d65390ffba31b60dc7d6369e0c4 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 4 Jul 2024 21:28:07 +0000 Subject: [PATCH 11/49] remove mpmd_opt from APRUN_UFS --- env/AWSPW.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index 54f2643b1a..cc7c94b99b 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -29,7 +29,7 @@ if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then (( nnodes = (${!nprocs}+${!ppn}-1)/${!ppn} )) (( ntasks = nnodes*${!ppn} )) # With ESMF threading, the model wants to use the full node - export APRUN_UFS="${launcher} -n ${ntasks} ${mpmd_opt}" + export APRUN_UFS="${launcher} -n ${ntasks}" unset nprocs ppn nnodes ntasks elif [[ "${step}" = "post" ]]; then From 965ec80e857db312f9e9411cea805387cbd0bbbf Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 5 Jul 2024 15:23:37 +0000 Subject: [PATCH 12/49] mpmd_opt and switch off tracker/genesis default for AWS --- env/AWSPW.env | 2 +- parm/config/gfs/config.base | 7 +++++++ workflow/rocoto/gfs_tasks.py | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index cc7c94b99b..751f52db41 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -10,7 +10,7 @@ fi step=$1 export launcher="srun --mpi=pmi2 -l" -export mpmd_opt="--distribution=block:block --hint=nomultithread --cpus-per-task=1" +export mpmd_opt="" # Configure MPI environment export OMP_STACKSIZE=2048000 diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index f78c7fb400..804760bb5e 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -478,4 +478,11 @@ export OFFSET_START_HOUR=0 # Number of regional collectives to create soundings for export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9} +# The tracker, genesis, and METplus jobs are not supported on AWS yet +if [[ "${machine}" == "AWSPW" ]]; then + export DO_TRACKER="NO" + export DO_GENESIS="NO" + export DO_METP="NO" +fi + echo "END: config.base" diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index 99299950d0..581a483564 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -27,7 +27,9 @@ def stage_ic(self): #The if block below is added for AWS. #If we have a proper way to define 'BASE_CPLIC', this if block can be removed. 
if ('BASE_CPLIC' not in cpl_ic.keys()): - cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs') + cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs') + if ('CPL_ATMIC' not in cpl_ic.keys()): + cpl_ic['CPL_ATMIC'] = os.environ.get('CPL_ATMIC', 'workflow_C48_refactored') prefix = f"{cpl_ic['BASE_CPLIC']}/{cpl_ic['CPL_ATMIC']}/@Y@m@d@H/atmos" for file in ['gfs_ctrl.nc'] + \ [f'{datatype}_data.tile{tile}.nc' From 3ce268eafa275c6f44a75de5cd061bafeb331a56 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 5 Jul 2024 15:49:03 +0000 Subject: [PATCH 13/49] add TODO --- parm/config/gfs/config.base | 1 + 1 file changed, 1 insertion(+) diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 804760bb5e..43ab218b90 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -479,6 +479,7 @@ export OFFSET_START_HOUR=0 export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9} # The tracker, genesis, and METplus jobs are not supported on AWS yet +# TODO: we should place these in workflow/hosts/awspw.yaml as part of AWS setup, not for general. if [[ "${machine}" == "AWSPW" ]]; then export DO_TRACKER="NO" export DO_GENESIS="NO" From f03ac786a2db70b08b768acec79a56266726999e Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Sat, 6 Jul 2024 22:02:39 +0000 Subject: [PATCH 14/49] remove ncl version on AWS --- modulefiles/module_base.noaacloud.lua | 2 -- sorc/ufs_model.fd | 2 +- versions/run.noaacloud.ver | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua index fb5b283087..6d2182062e 100644 --- a/modulefiles/module_base.noaacloud.lua +++ b/modulefiles/module_base.noaacloud.lua @@ -9,9 +9,7 @@ load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None"))) load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None"))) load(pathJoin("python", (os.getenv("python_ver") or "None"))) ---load(pathJoin("hpss", (os.getenv("hpss_ver") or "None"))) load(pathJoin("gempak", (os.getenv("gempak_ver") or "None"))) -load(pathJoin("ncl", (os.getenv("ncl_ver") or "None"))) load(pathJoin("jasper", (os.getenv("jasper_ver") or "None"))) load(pathJoin("libpng", (os.getenv("libpng_ver") or "None"))) load(pathJoin("cdo", (os.getenv("cdo_ver") or "None"))) diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index e784814dfc..a183a52151 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit e784814dfce3fb01e82be6d3949f9811860041d7 +Subproject commit a183a521516110cc9bcb86d853bd9b0dccef5bc7 diff --git a/versions/run.noaacloud.ver b/versions/run.noaacloud.ver index 7c23da0e7a..c85d29a71a 100644 --- a/versions/run.noaacloud.ver +++ b/versions/run.noaacloud.ver @@ -7,5 +7,4 @@ export gempak_ver=7.4.2 source "${HOMEgfs:-}/versions/run.spack.ver" export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" -export ncl_ver=6.6.2 export cdo_ver=2.2.0 From 2f6ec6ed1bb0faaf88bbb0b199f0e49c3d31f3b6 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Sat, 6 Jul 2024 22:25:57 +0000 Subject: [PATCH 15/49] sync ufs_model --- sorc/ufs_model.fd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index e784814dfc..a183a52151 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit e784814dfce3fb01e82be6d3949f9811860041d7 +Subproject commit 
a183a521516110cc9bcb86d853bd9b0dccef5bc7 From e8a2e0facc7ba249c1d74639a6eb7c9a44ea9b69 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 12 Jul 2024 19:31:25 +0000 Subject: [PATCH 16/49] sync and remove gempak from noaacloud --- modulefiles/module_base.noaacloud.lua | 1 - sorc/gfs_utils.fd | 2 +- sorc/gsi_enkf.fd | 2 +- sorc/gsi_monitor.fd | 2 +- sorc/gsi_utils.fd | 2 +- sorc/ufs_utils.fd | 2 +- sorc/wxflow | 2 +- versions/run.noaacloud.ver | 2 -- workflow/rocoto/tasks.py | 1 + 9 files changed, 7 insertions(+), 9 deletions(-) diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua index 6d2182062e..113409e41d 100644 --- a/modulefiles/module_base.noaacloud.lua +++ b/modulefiles/module_base.noaacloud.lua @@ -9,7 +9,6 @@ load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None"))) load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None"))) load(pathJoin("python", (os.getenv("python_ver") or "None"))) -load(pathJoin("gempak", (os.getenv("gempak_ver") or "None"))) load(pathJoin("jasper", (os.getenv("jasper_ver") or "None"))) load(pathJoin("libpng", (os.getenv("libpng_ver") or "None"))) load(pathJoin("cdo", (os.getenv("cdo_ver") or "None"))) diff --git a/sorc/gfs_utils.fd b/sorc/gfs_utils.fd index 02ce084c24..0cdc279526 160000 --- a/sorc/gfs_utils.fd +++ b/sorc/gfs_utils.fd @@ -1 +1 @@ -Subproject commit 02ce084c244823e22661d493a50236b7d5eaf70a +Subproject commit 0cdc2795260fc1b59e86a873729433a470794a97 diff --git a/sorc/gsi_enkf.fd b/sorc/gsi_enkf.fd index 529bb796be..8e279f9c73 160000 --- a/sorc/gsi_enkf.fd +++ b/sorc/gsi_enkf.fd @@ -1 +1 @@ -Subproject commit 529bb796bea0e490f186729cd168a91c034bb12d +Subproject commit 8e279f9c734097f673b07e80f385b2623d13ba4a diff --git a/sorc/gsi_monitor.fd b/sorc/gsi_monitor.fd index e1f9f21af1..f9d6f5f744 160000 --- a/sorc/gsi_monitor.fd +++ b/sorc/gsi_monitor.fd @@ -1 +1 @@ -Subproject commit e1f9f21af16ce912fdc2cd75c5b27094a550a0c5 +Subproject commit f9d6f5f744462a449e70abed8c5860b1c4564ad8 diff --git a/sorc/gsi_utils.fd b/sorc/gsi_utils.fd index 9382fd01c2..4332814529 160000 --- a/sorc/gsi_utils.fd +++ b/sorc/gsi_utils.fd @@ -1 +1 @@ -Subproject commit 9382fd01c2a626c8934c3f553d420a45de2b4dec +Subproject commit 4332814529465ab8eb58e43a38227b952ebfca49 diff --git a/sorc/ufs_utils.fd b/sorc/ufs_utils.fd index 3ef2e6bd72..2794d413d0 160000 --- a/sorc/ufs_utils.fd +++ b/sorc/ufs_utils.fd @@ -1 +1 @@ -Subproject commit 3ef2e6bd725d2662fd6ee95897cb7bac222e5144 +Subproject commit 2794d413d083b43d9ba37a15375d5c61b610d29e diff --git a/sorc/wxflow b/sorc/wxflow index 1356acdb2b..5dad7dd61c 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 1356acdb2bbca28e442597699da1a295faa18fe3 +Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 diff --git a/versions/run.noaacloud.ver b/versions/run.noaacloud.ver index c85d29a71a..4c9ac3cd42 100644 --- a/versions/run.noaacloud.ver +++ b/versions/run.noaacloud.ver @@ -2,8 +2,6 @@ export stack_intel_ver=2021.3.0 export stack_impi_ver=2021.3.0 export spack_env=gsi-addon-env -export gempak_ver=7.4.2 - source "${HOMEgfs:-}/versions/run.spack.ver" export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 785ced9900..fc47efb800 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -241,6 +241,7 @@ def get_resource(self, task_name): #as below. 
Or, it won't run, but with an error: #"ufs_model.x: error while loading shared libraries: libiomp5.so: cannot open shared object file: No such file or directory" #Even the library path is clearly in LD_LIBRARY_PATH, or load exactly the modules when build ufs_model.x + #TODO: find a mechanism to provide native scheduler information. pw_csp = os.environ.get('PW_CSP', 'unknown') if ( pw_csp in ['aws', 'azure', 'google'] ): native = '--export=ALL --exclusive' From a548c7f92e5012e79b4fdc15e0691b92e6838918 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Mon, 15 Jul 2024 19:18:47 +0000 Subject: [PATCH 17/49] update modules hash --- sorc/gdas.cd | 2 +- sorc/ufs_utils.fd | 2 +- sorc/wxflow | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sorc/gdas.cd b/sorc/gdas.cd index e3644a98c3..368c9c5db9 160000 --- a/sorc/gdas.cd +++ b/sorc/gdas.cd @@ -1 +1 @@ -Subproject commit e3644a98c362d7321f9e3081a4e55947885ed2bf +Subproject commit 368c9c5db9b5ea62e72937b6d1b0f753adb9be40 diff --git a/sorc/ufs_utils.fd b/sorc/ufs_utils.fd index 2794d413d0..65b530560c 160000 --- a/sorc/ufs_utils.fd +++ b/sorc/ufs_utils.fd @@ -1 +1 @@ -Subproject commit 2794d413d083b43d9ba37a15375d5c61b610d29e +Subproject commit 65b530560c0a1620982d1857fdb36d65be17b867 diff --git a/sorc/wxflow b/sorc/wxflow index 5dad7dd61c..8406beeea4 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 +Subproject commit 8406beeea410118cdfbd8300895b2b2878eadba6 From d37e646170b9623a860f8f8e09fda804b67e60da Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Mon, 15 Jul 2024 20:39:46 +0000 Subject: [PATCH 18/49] update module hash --- sorc/gdas.cd | 2 +- sorc/ufs_utils.fd | 2 +- sorc/wxflow | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sorc/gdas.cd b/sorc/gdas.cd index 368c9c5db9..e3644a98c3 160000 --- a/sorc/gdas.cd +++ b/sorc/gdas.cd @@ -1 +1 @@ -Subproject commit 368c9c5db9b5ea62e72937b6d1b0f753adb9be40 +Subproject commit e3644a98c362d7321f9e3081a4e55947885ed2bf diff --git a/sorc/ufs_utils.fd b/sorc/ufs_utils.fd index 65b530560c..2794d413d0 160000 --- a/sorc/ufs_utils.fd +++ b/sorc/ufs_utils.fd @@ -1 +1 @@ -Subproject commit 65b530560c0a1620982d1857fdb36d65be17b867 +Subproject commit 2794d413d083b43d9ba37a15375d5c61b610d29e diff --git a/sorc/wxflow b/sorc/wxflow index 8406beeea4..5dad7dd61c 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 8406beeea410118cdfbd8300895b2b2878eadba6 +Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 From 2a8016206440a04f5fef233cbdfca57df72f5ee9 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 17 Jul 2024 19:19:40 +0000 Subject: [PATCH 19/49] use bucket --- workflow/hosts/awspw.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index 20d87d5acc..3941964dae 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -19,7 +19,6 @@ CHGRP_RSTPROD: 'YES' CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. -#BASE_CPLIC: '/contrib/global-workflow-shared-data/ICSDIR/prototype_ICs' BASE_CPLIC: '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs' LOCALARCH: 'NO' ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. 
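The patches above keep reaching for the same guard: read PW_CSP and, when it names one of the NOAA cloud providers, switch to cloud-specific behavior, namely the extra SHELL/BASH_ENV crontab lines, the Slurm "--export=ALL --exclusive" native flags that work around the libiomp5.so load failure, and dropping the per-task memory requests that otherwise make the arch and cleanup jobs hang. A minimal sketch of that recurring pattern, with a helper name of my own; only the provider list, the Slurm flags, and the crontab lines are taken from the diffs:

import os

def on_noaa_cloud() -> bool:
    # PW_CSP is only set on the Parallel Works clusters ("aws", "azure", "google").
    return os.environ.get('PW_CSP', 'unknown') in ['aws', 'azure', 'google']

# Slurm native flags: the clouds need the full environment exported and exclusive
# nodes; everywhere else the workflow keeps --export=NONE.
native = '--export=ALL --exclusive' if on_noaa_cloud() else '--export=NONE'

# Crontab header: without SHELL/BASH_ENV the rocotorun cron entry never starts.
crontab_extra = ['SHELL="/bin/bash"', 'BASH_ENV="/etc/bashrc"'] if on_noaa_cloud() else []

# Per-task memory requests are skipped on the clouds.
request_memory = not on_noaa_cloud()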
From fa448626aba1b11bdabd2eabedc1366b8064183d Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 17 Jul 2024 19:33:49 +0000 Subject: [PATCH 20/49] remove /scratch1, but kept TODO --- workflow/hosts/awspw.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index 3941964dae..81678004dc 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -1,7 +1,7 @@ -BASE_GIT: '/scratch1/NCEPDEV/global/glopara/git' #TODO: This does not yet exist. -DMPDIR: '/scratch1/NCEPDEV/global/glopara/dump' # TODO: This does not yet exist. -PACKAGEROOT: '/scratch1/NCEPDEV/global/glopara/nwpara' #TODO: This does not yet exist. -COMINsyn: '/scratch1/NCEPDEV/global/glopara/com/gfs/prod/syndat' #TODO: This does not yet exist. +BASE_GIT: '' #TODO: This does not yet exist. +DMPDIR: '' # TODO: This does not yet exist. +PACKAGEROOT: '' #TODO: This does not yet exist. +COMINsyn: '' #TODO: This does not yet exist. HOMEDIR: '/contrib/${USER}' STMP: '/lustre/${USER}/stmp/' PTMP: '/lustre/${USER}/ptmp/' @@ -21,7 +21,7 @@ HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. BASE_CPLIC: '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs' LOCALARCH: 'NO' -ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. +ATARDIR: '' # TODO: This will not yet work from AWS. MAKE_NSSTBUFR: 'NO' MAKE_ACFTBUFR: 'NO' SUPPORTED_RESOLUTIONS: ['C48', 'C96'] # TODO: Test and support all cubed-sphere resolutions. From 07851dc44c988f93fc75beed5cf4bcf8ef980d7d Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jul 2024 22:09:28 +0000 Subject: [PATCH 21/49] re-sync --- parm/config/gfs/config.resources.AWSPW | 8 ++++++++ sorc/gfs_utils.fd | 2 +- sorc/gsi_enkf.fd | 2 +- sorc/gsi_monitor.fd | 2 +- sorc/gsi_utils.fd | 2 +- sorc/ufs_model.fd | 2 +- sorc/ufs_utils.fd | 2 +- sorc/wxflow | 2 +- workflow/rocoto/tasks.py | 14 -------------- 9 files changed, 15 insertions(+), 21 deletions(-) create mode 100644 parm/config/gfs/config.resources.AWSPW diff --git a/parm/config/gfs/config.resources.AWSPW b/parm/config/gfs/config.resources.AWSPW new file mode 100644 index 0000000000..48a4440fe2 --- /dev/null +++ b/parm/config/gfs/config.resources.AWSPW @@ -0,0 +1,8 @@ +#! 
/usr/bin/env bash + +# AWS-specific job resources + +# shellcheck disable=SC2312 +for mem_var in $(env | grep '^memory_' | cut -d= -f1); do + unset "${mem_var}" +done diff --git a/sorc/gfs_utils.fd b/sorc/gfs_utils.fd index 0cdc279526..02ce084c24 160000 --- a/sorc/gfs_utils.fd +++ b/sorc/gfs_utils.fd @@ -1 +1 @@ -Subproject commit 0cdc2795260fc1b59e86a873729433a470794a97 +Subproject commit 02ce084c244823e22661d493a50236b7d5eaf70a diff --git a/sorc/gsi_enkf.fd b/sorc/gsi_enkf.fd index 8e279f9c73..529bb796be 160000 --- a/sorc/gsi_enkf.fd +++ b/sorc/gsi_enkf.fd @@ -1 +1 @@ -Subproject commit 8e279f9c734097f673b07e80f385b2623d13ba4a +Subproject commit 529bb796bea0e490f186729cd168a91c034bb12d diff --git a/sorc/gsi_monitor.fd b/sorc/gsi_monitor.fd index f9d6f5f744..e1f9f21af1 160000 --- a/sorc/gsi_monitor.fd +++ b/sorc/gsi_monitor.fd @@ -1 +1 @@ -Subproject commit f9d6f5f744462a449e70abed8c5860b1c4564ad8 +Subproject commit e1f9f21af16ce912fdc2cd75c5b27094a550a0c5 diff --git a/sorc/gsi_utils.fd b/sorc/gsi_utils.fd index 4332814529..9382fd01c2 160000 --- a/sorc/gsi_utils.fd +++ b/sorc/gsi_utils.fd @@ -1 +1 @@ -Subproject commit 4332814529465ab8eb58e43a38227b952ebfca49 +Subproject commit 9382fd01c2a626c8934c3f553d420a45de2b4dec diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index a183a52151..e784814dfc 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit a183a521516110cc9bcb86d853bd9b0dccef5bc7 +Subproject commit e784814dfce3fb01e82be6d3949f9811860041d7 diff --git a/sorc/ufs_utils.fd b/sorc/ufs_utils.fd index 2794d413d0..3ef2e6bd72 160000 --- a/sorc/ufs_utils.fd +++ b/sorc/ufs_utils.fd @@ -1 +1 @@ -Subproject commit 2794d413d083b43d9ba37a15375d5c61b610d29e +Subproject commit 3ef2e6bd725d2662fd6ee95897cb7bac222e5144 diff --git a/sorc/wxflow b/sorc/wxflow index 5dad7dd61c..1356acdb2b 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 +Subproject commit 1356acdb2bbca28e442597699da1a295faa18fe3 diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index fc47efb800..a852757512 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -177,13 +177,6 @@ def get_resource(self, task_name): task_config = self._configs[task_name] - #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, - #use "$USER" as account. - pw_csp = os.environ.get('PW_CSP', None) - if ( pw_csp in ['aws', 'azure', 'google'] ): - task_config['ACCOUNT'] = os.environ.get('USER') - task_config['ACCOUNT_SERVICE'] = os.environ.get('USER') - account = task_config['ACCOUNT_SERVICE'] if task_name in Tasks.SERVICE_TASKS else task_config['ACCOUNT'] if f'wtime_{task_name}_{self.cdump}' in task_config: @@ -213,13 +206,6 @@ def get_resource(self, task_name): else: # Memory is not required memory = task_config.get(f'memory_{task_name}', None) - - #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, cannot define 'memory' here, - #Or the arch and cleanup will hang. 
- if ( pw_csp in ['aws', 'azure', 'google'] ): - memory = None - else: - memory = task_config.get(f'memory_{task_name}', None) if scheduler in ['pbspro']: if task_config.get('prepost', False): memory += ':prepost=true' From 492808df04265f480c7c27da350245b346cff7cc Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 19 Jul 2024 22:42:38 +0000 Subject: [PATCH 22/49] sync --- sorc/ufs_model.fd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index e784814dfc..6a6ce431e4 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit e784814dfce3fb01e82be6d3949f9811860041d7 +Subproject commit 6a6ce431e46846a576bef6ebf4be82035c26c7f3 From d7a262e0ff25331728a23c67666bf9cabfb8d397 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 23 Jul 2024 12:58:04 +0000 Subject: [PATCH 23/49] add is_exclusive to resource.AWSPW --- parm/config/gfs/config.resources.AWSPW | 2 ++ workflow/rocoto/tasks.py | 8 +------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/parm/config/gfs/config.resources.AWSPW b/parm/config/gfs/config.resources.AWSPW index 48a4440fe2..8649713bb7 100644 --- a/parm/config/gfs/config.resources.AWSPW +++ b/parm/config/gfs/config.resources.AWSPW @@ -2,6 +2,8 @@ # AWS-specific job resources +export is_exclusive="True" + # shellcheck disable=SC2312 for mem_var in $(env | grep '^memory_' | cut -d= -f1); do unset "${mem_var}" diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index a852757512..f506c1d9dc 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -223,13 +223,7 @@ def get_resource(self, task_name): else: native += ':shared' elif scheduler in ['slurm']: - #The PW_CSP is a AWS (CSPs parameter), if it is on CSPs, we need 'native' defined - #as below. Or, it won't run, but with an error: - #"ufs_model.x: error while loading shared libraries: libiomp5.so: cannot open shared object file: No such file or directory" - #Even the library path is clearly in LD_LIBRARY_PATH, or load exactly the modules when build ufs_model.x - #TODO: find a mechanism to provide native scheduler information. 
- pw_csp = os.environ.get('PW_CSP', 'unknown') - if ( pw_csp in ['aws', 'azure', 'google'] ): + if task_config.get('is_exclusive', False): native = '--export=ALL --exclusive' else: native = '--export=NONE' From af573afb1ef088d322637ef97232964f128d1952 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 23 Jul 2024 15:54:47 +0000 Subject: [PATCH 24/49] sync hash with EMC repo --- sorc/ufs_model.fd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/ufs_model.fd b/sorc/ufs_model.fd index 6a6ce431e4..e784814dfc 160000 --- a/sorc/ufs_model.fd +++ b/sorc/ufs_model.fd @@ -1 +1 @@ -Subproject commit 6a6ce431e46846a576bef6ebf4be82035c26c7f3 +Subproject commit e784814dfce3fb01e82be6d3949f9811860041d7 From 092918053f7f8e8617e891c417a87562890a6544 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 23 Jul 2024 16:57:27 +0000 Subject: [PATCH 25/49] remove --export=ALL from native, when is_exclusive set true --- workflow/rocoto/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index f506c1d9dc..d339deb974 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -224,7 +224,7 @@ def get_resource(self, task_name): native += ':shared' elif scheduler in ['slurm']: if task_config.get('is_exclusive', False): - native = '--export=ALL --exclusive' + native = '--exclusive' else: native = '--export=NONE' if task_config['RESERVATION'] != "": From 77e82335c8f62f4de00749f5b2bc7f74286d227d Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 25 Jul 2024 11:42:36 +0000 Subject: [PATCH 26/49] Make AWS works similar to on-prem machine --- ush/load_ufswm_modules.sh | 45 +++++++++++---------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/ush/load_ufswm_modules.sh b/ush/load_ufswm_modules.sh index 6477a8ff39..f00358095d 100755 --- a/ush/load_ufswm_modules.sh +++ b/ush/load_ufswm_modules.sh @@ -11,40 +11,21 @@ ulimit_s=$( ulimit -S -s ) source "${HOMEgfs}/ush/detect_machine.sh" source "${HOMEgfs}/ush/module-setup.sh" -if [[ "${MACHINE_ID}" != "noaacloud" ]]; then - module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles" - module load "ufs_${MACHINE_ID}.intel" - module load prod_util - if [[ "${MACHINE_ID}" = "wcoss2" ]]; then - module load cray-pals - module load cfp - module load libjpeg - module load craype-network-ucx - module load cray-mpich-ucx - else - export UTILROOT=${prod_util_ROOT} - fi - module load wgrib2 - export WGRIB2=wgrib2 -fi -if [[ "${MACHINE_ID}" == "noaacloud" ]]; then - if [[ "${PW_CSP:-}" = "aws" ]]; then - # TODO: This can be cleaned-up; most of this is a hack for now. - module use "/contrib/spack-stack/envs/ufswm/install/modulefiles/Core" - module load "stack-intel" - module load "stack-intel-oneapi-mpi" - module use -a "/contrib/spack-stack/miniconda/modulefiles/miniconda/" - module load "py39_4.12.0" - module load "ufs-weather-model-env/1.0.0" - export NETCDF="/contrib/spack-stack/miniconda/apps/miniconda/py39_4.12.0" - # TODO: Are there plans for EPIC to maintain this package or should GW provide support? 
- export UTILROOT="/contrib/global-workflow/NCEPLIBS-prod_util" - export PATH="${PATH}:/contrib/global-workflow/bin" - ndate_path="$(command -v ndate)" - export NDATE="${ndate_path}" - fi +module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles" +module load "ufs_${MACHINE_ID}.intel" +module load prod_util +if [[ "${MACHINE_ID}" = "wcoss2" ]]; then + module load cray-pals + module load cfp + module load libjpeg + module load craype-network-ucx + module load cray-mpich-ucx +else + export UTILROOT=${prod_util_ROOT} fi +module load wgrib2 +export WGRIB2=wgrib2 module list unset MACHINE_ID From 96f73ba6045ae1ac481b1bc4983a15d63c820131 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 25 Jul 2024 12:41:13 +0000 Subject: [PATCH 27/49] remove --export=ALL from 'native' --- workflow/rocoto/tasks.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index a2bac2e571..67c1cff39e 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -import os import numpy as np from applications.applications import AppConfig import rocoto.rocoto as rocoto @@ -206,9 +205,10 @@ def get_resource(self, task_name): else: # Memory is not required memory = task_config.get(f'memory_{task_name}', None) - if scheduler in ['pbspro']: - if task_config.get('prepost', False): - memory += ':prepost=true' + + if scheduler in ['pbspro']: + if task_config.get('prepost', False): + memory += ':prepost=true' native = None if scheduler in ['pbspro']: @@ -224,9 +224,9 @@ def get_resource(self, task_name): native += ':shared' elif scheduler in ['slurm']: if task_config.get('is_exclusive', False): - native = '--exclusive' - else: - native = '--export=NONE' +- native = '--exclusive' +- else: +- native = '--export=NONE' if task_config['RESERVATION'] != "": native += '' if task_name in Tasks.SERVICE_TASKS else ' --reservation=' + task_config['RESERVATION'] if task_config.get('CLUSTERS', "") not in ["", '@CLUSTERS@']: From a33a3be11f94c2397be795a5c9151cbc91ee05da Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 25 Jul 2024 12:57:45 +0000 Subject: [PATCH 28/49] remove --export=ALL from 'native' --- workflow/rocoto/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 67c1cff39e..60872bac65 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -224,9 +224,9 @@ def get_resource(self, task_name): native += ':shared' elif scheduler in ['slurm']: if task_config.get('is_exclusive', False): -- native = '--exclusive' -- else: -- native = '--export=NONE' + native = '--exclusive' + else: + native = '--export=NONE' if task_config['RESERVATION'] != "": native += '' if task_name in Tasks.SERVICE_TASKS else ' --reservation=' + task_config['RESERVATION'] if task_config.get('CLUSTERS', "") not in ["", '@CLUSTERS@']: From 01a892878c1310e79ecfcfb672de4916ea6aaeea Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 25 Jul 2024 19:41:05 +0000 Subject: [PATCH 29/49] add py-f90nml to noaacloud modulefile --- modulefiles/module_base.noaacloud.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua index 113409e41d..7997b618e4 100644 --- a/modulefiles/module_base.noaacloud.lua +++ b/modulefiles/module_base.noaacloud.lua @@ -26,6 +26,7 @@ load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None"))) load(pathJoin("crtm", (os.getenv("crtm_ver") or 
"None"))) load(pathJoin("bufr", (os.getenv("bufr_ver") or "None"))) load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None"))) +load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None"))) load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None"))) load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None"))) load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None"))) From b035947ff0db984fd7adcf78411f95983e554585 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 25 Jul 2024 19:56:27 +0000 Subject: [PATCH 30/49] remove un-necessary added lines --- workflow/rocoto/gefs_tasks.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index b1031d3335..da1277b787 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -11,10 +11,6 @@ def __init__(self, app_config: AppConfig, cdump: str) -> None: def stage_ic(self): cpl_ic = self._configs['stage_ic'] - #The if block below is added for AWS. - #If we have a proper way to define 'BASE_CPLIC', this if block can be removed. - if ('BASE_CPLIC' not in cpl_ic.keys()): - cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/contrib/Wei.Huang/data/ICDIRS/prototype_ICs') deps = [] dtg_prefix = "@Y@m@d.@H0000" offset = str(self._configs['base']['OFFSET_START_HOUR']).zfill(2) + ":00:00" From bf3b460ef63704e9712bd9a6a44d7bb13cb790ff Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 25 Jul 2024 19:59:59 +0000 Subject: [PATCH 31/49] remove un-necessary added lines --- workflow/rocoto/gefs_tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index da1277b787..882c6e1484 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -2,7 +2,6 @@ from rocoto.tasks import Tasks import rocoto.rocoto as rocoto from datetime import datetime, timedelta -import os class GEFSTasks(Tasks): From 47627ff023df7ef139ee03fb5dc910432583fd25 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 26 Jul 2024 16:51:31 +0000 Subject: [PATCH 32/49] remove added lines which was originally for AWS, but should be defined in hosts/awspw.yaml --- workflow/rocoto/gfs_tasks.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/workflow/rocoto/gfs_tasks.py b/workflow/rocoto/gfs_tasks.py index a237955a4b..1f14751642 100644 --- a/workflow/rocoto/gfs_tasks.py +++ b/workflow/rocoto/gfs_tasks.py @@ -3,7 +3,7 @@ from wxflow import timedelta_to_HMS import rocoto.rocoto as rocoto import numpy as np -import os + class GFSTasks(Tasks): @@ -24,12 +24,6 @@ def stage_ic(self): # Atm ICs if self.app_config.do_atm: - #The if block below is added for AWS. - #If we have a proper way to define 'BASE_CPLIC', this if block can be removed. 
- if ('BASE_CPLIC' not in cpl_ic.keys()): - cpl_ic['BASE_CPLIC'] = os.environ.get('BASE_CPLIC', '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs') - if ('CPL_ATMIC' not in cpl_ic.keys()): - cpl_ic['CPL_ATMIC'] = os.environ.get('CPL_ATMIC', 'workflow_C48_refactored') prefix = f"{cpl_ic['BASE_CPLIC']}/{cpl_ic['CPL_ATMIC']}/@Y@m@d@H/atmos" for file in ['gfs_ctrl.nc'] + \ [f'{datatype}_data.tile{tile}.nc' From 7bf89004bb9be2e4b600dfe59398822054b72ff4 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Fri, 26 Jul 2024 16:56:59 +0000 Subject: [PATCH 33/49] restore as develop --- workflow/rocoto/gefs_tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/rocoto/gefs_tasks.py b/workflow/rocoto/gefs_tasks.py index 882c6e1484..f6d1ad6cdc 100644 --- a/workflow/rocoto/gefs_tasks.py +++ b/workflow/rocoto/gefs_tasks.py @@ -3,6 +3,7 @@ import rocoto.rocoto as rocoto from datetime import datetime, timedelta + class GEFSTasks(Tasks): def __init__(self, app_config: AppConfig, cdump: str) -> None: From 0685a8f8230ae168744d36a656906e20d8c1fc9f Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Mon, 29 Jul 2024 13:47:53 +0000 Subject: [PATCH 34/49] try to fix pynorms error --- env/AWSPW.env | 244 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 224 insertions(+), 20 deletions(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index 751f52db41..f9158854bc 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -9,17 +9,205 @@ fi step=$1 -export launcher="srun --mpi=pmi2 -l" -export mpmd_opt="" +export launcher="srun -l --export=ALL" +export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out" + +# export POSTAMBLE_CMD='report-mem' # Configure MPI environment export OMP_STACKSIZE=2048000 export NTHSTACK=1024000000 -ulimit -s unlimited -ulimit -a +# Setting stacksize to unlimited on login nodes is prohibited +if [[ -n "${SLURM_JOB_ID:-}" ]]; then + ulimit -s unlimited + ulimit -a +fi + +if [[ "${step}" = "prep" ]] || [[ "${step}" = "prepbufr" ]]; then + + nth_max=$((npe_node_max / npe_node_prep)) + + export POE="NO" + export BACK="NO" + export sys_tp="HERA" + export launcher_PREP="srun" + +elif [[ "${step}" = "prepsnowobs" ]]; then + + export APRUN_CALCFIMS="${launcher} -n 1" + +elif [[ "${step}" = "prep_emissions" ]]; then + + export APRUN="${launcher} -n 1" + +elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || [[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostbndpntbll" ]] || [[ "${step}" = "wavepostpnt" ]]; then + + export CFP_MP="YES" + if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi + export wavempexec=${launcher} + export wave_mpmd=${mpmd_opt} + +elif [[ "${step}" = "atmanlvar" ]]; then + + nth_max=$((npe_node_max / npe_node_atmanlvar)) + + export NTHREADS_ATMANLVAR=${nth_atmanlvar:-${nth_max}} + [[ ${NTHREADS_ATMANLVAR} -gt ${nth_max} ]] && export NTHREADS_ATMANLVAR=${nth_max} + export APRUN_ATMANLVAR="${launcher} -n ${npe_atmanlvar} --cpus-per-task=${NTHREADS_ATMANLVAR}" + +elif [[ "${step}" = "atmensanlletkf" ]]; then + + nth_max=$((npe_node_max / npe_node_atmensanlletkf)) + + export NTHREADS_ATMENSANLLETKF=${nth_atmensanlletkf:-${nth_max}} + [[ ${NTHREADS_ATMENSANLLETKF} -gt ${nth_max} ]] && export NTHREADS_ATMENSANLLETKF=${nth_max} + export APRUN_ATMENSANLLETKF="${launcher} -n ${npe_atmensanlletkf} --cpus-per-task=${NTHREADS_ATMENSANLLETKF}" + +elif [[ "${step}" = "atmensanlfv3inc" ]]; then + + nth_max=$((npe_node_max / npe_node_atmensanlfv3inc)) + + export 
NTHREADS_ATMENSANLFV3INC=${nth_atmensanlfv3inc:-${nth_max}} + [[ ${NTHREADS_ATMENSANLFV3INC} -gt ${nth_max} ]] && export NTHREADS_ATMENSANLFV3INC=${nth_max} + export APRUN_ATMENSANLFV3INC="${launcher} -n ${npe_atmensanlfv3inc} --cpus-per-task=${NTHREADS_ATMENSANLFV3INC}" + +elif [[ "${step}" = "aeroanlrun" ]]; then + + export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" + + nth_max=$((npe_node_max / npe_node_aeroanlrun)) + + export NTHREADS_AEROANL=${nth_aeroanlrun:-${nth_max}} + [[ ${NTHREADS_AEROANL} -gt ${nth_max} ]] && export NTHREADS_AEROANL=${nth_max} + export APRUN_AEROANL="${launcher} -n ${npe_aeroanlrun} --cpus-per-task=${NTHREADS_AEROANL}" + +elif [[ "${step}" = "atmanlfv3inc" ]]; then + + nth_max=$((npe_node_max / npe_node_atmanlfv3inc)) + + export NTHREADS_ATMANLFV3INC=${nth_atmanlfv3inc:-${nth_max}} + [[ ${NTHREADS_ATMANLFV3INC} -gt ${nth_max} ]] && export NTHREADS_ATMANLFV3INC=${nth_max} + export APRUN_ATMANLFV3INC="${launcher} -n ${npe_atmanlfv3inc} --cpus-per-task=${NTHREADS_ATMANLFV3INC}" + +elif [[ "${step}" = "prepobsaero" ]]; then + + nth_max=$((npe_node_max / npe_node_prepobsaero)) + + export NTHREADS_PREPOBSAERO=${nth_prepobsaero:-1} + export APRUN_PREPOBSAERO="${launcher} -n ${npe_prepobsaero} --cpus-per-task=${NTHREADS_PREPOBSAERO}" + +elif [[ "${step}" = "snowanl" ]]; then + + nth_max=$((npe_node_max / npe_node_snowanl)) + + export NTHREADS_SNOWANL=${nth_snowanl:-${nth_max}} + [[ ${NTHREADS_SNOWANL} -gt ${nth_max} ]] && export NTHREADS_SNOWANL=${nth_max} + export APRUN_SNOWANL="${launcher} -n ${npe_snowanl} --cpus-per-task=${NTHREADS_SNOWANL}" + + export APRUN_APPLY_INCR="${launcher} -n 6" + +elif [[ "${step}" = "ocnanalbmat" ]]; then + + export APRUNCFP="${launcher} -n \$ncmd --multi-prog" + + export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalbmat}" + +elif [[ "${step}" = "ocnanalrun" ]]; then + + export APRUNCFP="${launcher} -n \$ncmd --multi-prog" + + export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalrun}" + +elif [[ "${step}" = "ocnanalchkpt" ]]; then -if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then + export APRUNCFP="${launcher} -n \$ncmd --multi-prog" + + export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalchkpt}" + +elif [[ "${step}" = "ocnanalecen" ]]; then + + nth_max=$((npe_node_max / npe_node_ocnanalecen)) + + export NTHREADS_OCNANALECEN=${nth_ocnanalecen:-${nth_max}} + [[ ${NTHREADS_OCNANALECEN} -gt ${nth_max} ]] && export NTHREADS_OCNANALECEN=${nth_max} + export APRUN_OCNANALECEN="${launcher} -n ${npe_ocnanalecen} --cpus-per-task=${NTHREADS_OCNANALECEN}" + +elif [[ "${step}" = "marineanalletkf" ]]; then + + nth_max=$((npe_node_max / npe_node_marineanalletkf)) + + export NTHREADS_MARINEANALLETKF=${nth_marineanalletkf:-${nth_max}} + [[ ${NTHREADS_MARINEANALLETKF} -gt ${nth_max} ]] && export NTHREADS_MARINEANALLETKF=${nth_max} + export APRUN_MARINEANALLETKF="${launcher} -n ${npe_marineanalletkf} --cpus-per-task=${NTHREADS_MARINEANALLETKF}" + +elif [[ "${step}" = "anal" ]] || [[ "${step}" = "analcalc" ]]; then + + export MKL_NUM_THREADS=4 + export MKL_CBWR=AUTO + + export CFP_MP=${CFP_MP:-"YES"} + export USE_CFP=${USE_CFP:-"YES"} + export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" + + nth_max=$((npe_node_max / npe_node_anal)) + + export NTHREADS_GSI=${nth_anal:-${nth_max}} + [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} + export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task=${NTHREADS_GSI}" + + export NTHREADS_CALCINC=${nth_calcinc:-1} + [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export 
NTHREADS_CALCINC=${nth_max} + export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task=${NTHREADS_CALCINC}" + + export NTHREADS_CYCLE=${nth_cycle:-12} + [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} + npe_cycle=${ntiles:-6} + export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task=${NTHREADS_CYCLE}" + + export NTHREADS_GAUSFCANL=1 + npe_gausfcanl=${npe_gausfcanl:-1} + export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task=${NTHREADS_GAUSFCANL}" + +elif [[ "${step}" = "sfcanl" ]]; then + + nth_max=$((npe_node_max / npe_node_sfcanl)) + + export NTHREADS_CYCLE=${nth_sfcanl:-14} + [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} + npe_sfcanl=${ntiles:-6} + export APRUN_CYCLE="${launcher} -n ${npe_sfcanl} --cpus-per-task=${NTHREADS_CYCLE}" + +elif [[ "${step}" = "eobs" ]]; then + + export MKL_NUM_THREADS=4 + export MKL_CBWR=AUTO + + nth_max=$((npe_node_max / npe_node_eobs)) + + export NTHREADS_GSI=${nth_eobs:-${nth_max}} + [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} + export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_eobs}} --cpus-per-task=${NTHREADS_GSI}" + + export CFP_MP=${CFP_MP:-"YES"} + export USE_CFP=${USE_CFP:-"YES"} + export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" + +elif [[ "${step}" = "eupd" ]]; then + + nth_max=$((npe_node_max / npe_node_eupd)) + + export NTHREADS_ENKF=${nth_eupd:-${nth_max}} + [[ ${NTHREADS_ENKF} -gt ${nth_max} ]] && export NTHREADS_ENKF=${nth_max} + export APRUN_ENKF="${launcher} -n ${npe_enkf:-${npe_eupd}} --cpus-per-task=${NTHREADS_ENKF}" + + export CFP_MP=${CFP_MP:-"YES"} + export USE_CFP=${USE_CFP:-"YES"} + export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" + +elif [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then + + export launcher="srun --mpi=pmi2 -l" ppn="npe_node_${step}_${RUN}" [[ -z "${!ppn+0}" ]] && ppn="npe_node_${step}" @@ -32,17 +220,25 @@ if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then export APRUN_UFS="${launcher} -n ${ntasks}" unset nprocs ppn nnodes ntasks -elif [[ "${step}" = "post" ]]; then - nth_max=$((npe_node_max / npe_node_post)) +elif [[ "${step}" = "upp" ]]; then + + nth_max=$((npe_node_max / npe_node_upp)) + + export NTHREADS_UPP=${nth_upp:-1} + [[ ${NTHREADS_UPP} -gt ${nth_max} ]] && export NTHREADS_UPP=${nth_max} + export APRUN_UPP="${launcher} -n ${npe_upp} --cpus-per-task=${NTHREADS_UPP}" + +elif [[ "${step}" = "atmos_products" ]]; then + + export USE_CFP="YES" # Use MPMD for downstream product generation on Hera - export NTHREADS_NP=${nth_np:-1} - [[ ${NTHREADS_NP} -gt ${nth_max} ]] && export NTHREADS_NP=${nth_max} - export APRUN_NP="${launcher} -n ${npe_post}" +elif [[ "${step}" = "oceanice_products" ]]; then - export NTHREADS_DWN=${nth_dwn:-1} - [[ ${NTHREADS_DWN} -gt ${nth_max} ]] && export NTHREADS_DWN=${nth_max} - export APRUN_DWN="${launcher} -n ${npe_dwn}" + nth_max=$((npe_node_max / npe_node_oceanice_products)) + + export NTHREADS_OCNICEPOST=${nth_oceanice_products:-1} + export APRUN_OCNICEPOST="${launcher} -n 1 --cpus-per-task=${NTHREADS_OCNICEPOST}" elif [[ "${step}" = "ecen" ]]; then @@ -50,7 +246,7 @@ elif [[ "${step}" = "ecen" ]]; then export NTHREADS_ECEN=${nth_ecen:-${nth_max}} [[ ${NTHREADS_ECEN} -gt ${nth_max} ]] && export NTHREADS_ECEN=${nth_max} - export APRUN_ECEN="${launcher} -n ${npe_ecen}" + export APRUN_ECEN="${launcher} -n ${npe_ecen} --cpus-per-task=${NTHREADS_ECEN}" export NTHREADS_CHGRES=${nth_chgres:-12} [[ ${NTHREADS_CHGRES} -gt ${npe_node_max} ]] 
&& export NTHREADS_CHGRES=${npe_node_max} @@ -58,7 +254,7 @@ elif [[ "${step}" = "ecen" ]]; then export NTHREADS_CALCINC=${nth_calcinc:-1} [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max} - export APRUN_CALCINC="${launcher} -n ${npe_ecen}" + export APRUN_CALCINC="${launcher} -n ${npe_ecen} --cpus-per-task=${NTHREADS_CALCINC}" elif [[ "${step}" = "esfc" ]]; then @@ -66,11 +262,11 @@ elif [[ "${step}" = "esfc" ]]; then export NTHREADS_ESFC=${nth_esfc:-${nth_max}} [[ ${NTHREADS_ESFC} -gt ${nth_max} ]] && export NTHREADS_ESFC=${nth_max} - export APRUN_ESFC="${launcher} -n ${npe_esfc}" + export APRUN_ESFC="${launcher} -n ${npe_esfc} --cpus-per-task=${NTHREADS_ESFC}" export NTHREADS_CYCLE=${nth_cycle:-14} [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} - export APRUN_CYCLE="${launcher} -n ${npe_esfc}" + export APRUN_CYCLE="${launcher} -n ${npe_esfc} --cpus-per-task=${NTHREADS_CYCLE}" elif [[ "${step}" = "epos" ]]; then @@ -78,7 +274,7 @@ elif [[ "${step}" = "epos" ]]; then export NTHREADS_EPOS=${nth_epos:-${nth_max}} [[ ${NTHREADS_EPOS} -gt ${nth_max} ]] && export NTHREADS_EPOS=${nth_max} - export APRUN_EPOS="${launcher} -n ${npe_epos}" + export APRUN_EPOS="${launcher} -n ${npe_epos} --cpus-per-task=${NTHREADS_EPOS}" elif [[ "${step}" = "postsnd" ]]; then @@ -88,7 +284,7 @@ elif [[ "${step}" = "postsnd" ]]; then export NTHREADS_POSTSND=${nth_postsnd:-1} [[ ${NTHREADS_POSTSND} -gt ${nth_max} ]] && export NTHREADS_POSTSND=${nth_max} - export APRUN_POSTSND="${launcher} -n ${npe_postsnd}" + export APRUN_POSTSND="${launcher} -n ${npe_postsnd} --cpus-per-task=${NTHREADS_POSTSND}" export NTHREADS_POSTSNDCFP=${nth_postsndcfp:-1} [[ ${NTHREADS_POSTSNDCFP} -gt ${nth_max} ]] && export NTHREADS_POSTSNDCFP=${nth_max} @@ -102,6 +298,14 @@ elif [[ "${step}" = "awips" ]]; then [[ ${NTHREADS_AWIPS} -gt ${nth_max} ]] && export NTHREADS_AWIPS=${nth_max} export APRUN_AWIPSCFP="${launcher} -n ${npe_awips} ${mpmd_opt}" +elif [[ "${step}" = "gempak" ]]; then + + export CFP_MP="YES" + + nth_max=$((npe_node_max / npe_node_gempak)) + + export NTHREADS_GEMPAK=${nth_gempak:-1} + [[ ${NTHREADS_GEMPAK} -gt ${nth_max} ]] && export NTHREADS_GEMPAK=${nth_max} elif [[ "${step}" = "fit2obs" ]]; then @@ -109,6 +313,6 @@ elif [[ "${step}" = "fit2obs" ]]; then export NTHREADS_FIT2OBS=${nth_fit2obs:-1} [[ ${NTHREADS_FIT2OBS} -gt ${nth_max} ]] && export NTHREADS_FIT2OBS=${nth_max} - export MPIRUN="${launcher} -n ${npe_fit2obs}" + export MPIRUN="${launcher} -n ${npe_fit2obs} --cpus-per-task=${NTHREADS_FIT2OBS}" fi From 2c52016e9a6e10e44fedd25aa26f2847a9127cd5 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 30 Jul 2024 14:42:29 +0000 Subject: [PATCH 35/49] sync with EMC repo --- sorc/link_workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index 432c3f11d7..acc2dc9cc6 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -75,7 +75,7 @@ case "${machine}" in "hercules") FIX_DIR="/work/noaa/global/kfriedma/glopara/fix" ;; "jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;; "s4") FIX_DIR="/data/prod/glopara/fix" ;; - "gaea") FIX_DIR="/gpfs/f5/epic/proj-shared/global/glopara/data/fix" ;; + "gaea") FIX_DIR="/gpfs/f5/ufs-ard/world-shared/global/glopara/data/fix" ;; "noaacloud") FIX_DIR="/contrib/global-workflow-shared-data/fix" ;; *) echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR" From cd6c541b09f240c8d2eb24c31642f1fe928f4777 Mon Sep 17 00:00:00 2001 From: Wei 
Huang Date: Tue, 30 Jul 2024 15:05:21 +0000 Subject: [PATCH 36/49] sync Gaea link with EMC repo, and only include blocks/packs that run on AWS --- env/AWSPW.env | 263 ------------------------------------------ sorc/link_workflow.sh | 2 +- 2 files changed, 1 insertion(+), 264 deletions(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index f9158854bc..96f97b61da 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -33,178 +33,6 @@ if [[ "${step}" = "prep" ]] || [[ "${step}" = "prepbufr" ]]; then export sys_tp="HERA" export launcher_PREP="srun" -elif [[ "${step}" = "prepsnowobs" ]]; then - - export APRUN_CALCFIMS="${launcher} -n 1" - -elif [[ "${step}" = "prep_emissions" ]]; then - - export APRUN="${launcher} -n 1" - -elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || [[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostbndpntbll" ]] || [[ "${step}" = "wavepostpnt" ]]; then - - export CFP_MP="YES" - if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi - export wavempexec=${launcher} - export wave_mpmd=${mpmd_opt} - -elif [[ "${step}" = "atmanlvar" ]]; then - - nth_max=$((npe_node_max / npe_node_atmanlvar)) - - export NTHREADS_ATMANLVAR=${nth_atmanlvar:-${nth_max}} - [[ ${NTHREADS_ATMANLVAR} -gt ${nth_max} ]] && export NTHREADS_ATMANLVAR=${nth_max} - export APRUN_ATMANLVAR="${launcher} -n ${npe_atmanlvar} --cpus-per-task=${NTHREADS_ATMANLVAR}" - -elif [[ "${step}" = "atmensanlletkf" ]]; then - - nth_max=$((npe_node_max / npe_node_atmensanlletkf)) - - export NTHREADS_ATMENSANLLETKF=${nth_atmensanlletkf:-${nth_max}} - [[ ${NTHREADS_ATMENSANLLETKF} -gt ${nth_max} ]] && export NTHREADS_ATMENSANLLETKF=${nth_max} - export APRUN_ATMENSANLLETKF="${launcher} -n ${npe_atmensanlletkf} --cpus-per-task=${NTHREADS_ATMENSANLLETKF}" - -elif [[ "${step}" = "atmensanlfv3inc" ]]; then - - nth_max=$((npe_node_max / npe_node_atmensanlfv3inc)) - - export NTHREADS_ATMENSANLFV3INC=${nth_atmensanlfv3inc:-${nth_max}} - [[ ${NTHREADS_ATMENSANLFV3INC} -gt ${nth_max} ]] && export NTHREADS_ATMENSANLFV3INC=${nth_max} - export APRUN_ATMENSANLFV3INC="${launcher} -n ${npe_atmensanlfv3inc} --cpus-per-task=${NTHREADS_ATMENSANLFV3INC}" - -elif [[ "${step}" = "aeroanlrun" ]]; then - - export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" - - nth_max=$((npe_node_max / npe_node_aeroanlrun)) - - export NTHREADS_AEROANL=${nth_aeroanlrun:-${nth_max}} - [[ ${NTHREADS_AEROANL} -gt ${nth_max} ]] && export NTHREADS_AEROANL=${nth_max} - export APRUN_AEROANL="${launcher} -n ${npe_aeroanlrun} --cpus-per-task=${NTHREADS_AEROANL}" - -elif [[ "${step}" = "atmanlfv3inc" ]]; then - - nth_max=$((npe_node_max / npe_node_atmanlfv3inc)) - - export NTHREADS_ATMANLFV3INC=${nth_atmanlfv3inc:-${nth_max}} - [[ ${NTHREADS_ATMANLFV3INC} -gt ${nth_max} ]] && export NTHREADS_ATMANLFV3INC=${nth_max} - export APRUN_ATMANLFV3INC="${launcher} -n ${npe_atmanlfv3inc} --cpus-per-task=${NTHREADS_ATMANLFV3INC}" - -elif [[ "${step}" = "prepobsaero" ]]; then - - nth_max=$((npe_node_max / npe_node_prepobsaero)) - - export NTHREADS_PREPOBSAERO=${nth_prepobsaero:-1} - export APRUN_PREPOBSAERO="${launcher} -n ${npe_prepobsaero} --cpus-per-task=${NTHREADS_PREPOBSAERO}" - -elif [[ "${step}" = "snowanl" ]]; then - - nth_max=$((npe_node_max / npe_node_snowanl)) - - export NTHREADS_SNOWANL=${nth_snowanl:-${nth_max}} - [[ ${NTHREADS_SNOWANL} -gt ${nth_max} ]] && export NTHREADS_SNOWANL=${nth_max} - export APRUN_SNOWANL="${launcher} -n ${npe_snowanl} --cpus-per-task=${NTHREADS_SNOWANL}" - - export 
APRUN_APPLY_INCR="${launcher} -n 6" - -elif [[ "${step}" = "ocnanalbmat" ]]; then - - export APRUNCFP="${launcher} -n \$ncmd --multi-prog" - - export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalbmat}" - -elif [[ "${step}" = "ocnanalrun" ]]; then - - export APRUNCFP="${launcher} -n \$ncmd --multi-prog" - - export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalrun}" - -elif [[ "${step}" = "ocnanalchkpt" ]]; then - - export APRUNCFP="${launcher} -n \$ncmd --multi-prog" - - export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalchkpt}" - -elif [[ "${step}" = "ocnanalecen" ]]; then - - nth_max=$((npe_node_max / npe_node_ocnanalecen)) - - export NTHREADS_OCNANALECEN=${nth_ocnanalecen:-${nth_max}} - [[ ${NTHREADS_OCNANALECEN} -gt ${nth_max} ]] && export NTHREADS_OCNANALECEN=${nth_max} - export APRUN_OCNANALECEN="${launcher} -n ${npe_ocnanalecen} --cpus-per-task=${NTHREADS_OCNANALECEN}" - -elif [[ "${step}" = "marineanalletkf" ]]; then - - nth_max=$((npe_node_max / npe_node_marineanalletkf)) - - export NTHREADS_MARINEANALLETKF=${nth_marineanalletkf:-${nth_max}} - [[ ${NTHREADS_MARINEANALLETKF} -gt ${nth_max} ]] && export NTHREADS_MARINEANALLETKF=${nth_max} - export APRUN_MARINEANALLETKF="${launcher} -n ${npe_marineanalletkf} --cpus-per-task=${NTHREADS_MARINEANALLETKF}" - -elif [[ "${step}" = "anal" ]] || [[ "${step}" = "analcalc" ]]; then - - export MKL_NUM_THREADS=4 - export MKL_CBWR=AUTO - - export CFP_MP=${CFP_MP:-"YES"} - export USE_CFP=${USE_CFP:-"YES"} - export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" - - nth_max=$((npe_node_max / npe_node_anal)) - - export NTHREADS_GSI=${nth_anal:-${nth_max}} - [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} - export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task=${NTHREADS_GSI}" - - export NTHREADS_CALCINC=${nth_calcinc:-1} - [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max} - export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task=${NTHREADS_CALCINC}" - - export NTHREADS_CYCLE=${nth_cycle:-12} - [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} - npe_cycle=${ntiles:-6} - export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task=${NTHREADS_CYCLE}" - - export NTHREADS_GAUSFCANL=1 - npe_gausfcanl=${npe_gausfcanl:-1} - export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task=${NTHREADS_GAUSFCANL}" - -elif [[ "${step}" = "sfcanl" ]]; then - - nth_max=$((npe_node_max / npe_node_sfcanl)) - - export NTHREADS_CYCLE=${nth_sfcanl:-14} - [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} - npe_sfcanl=${ntiles:-6} - export APRUN_CYCLE="${launcher} -n ${npe_sfcanl} --cpus-per-task=${NTHREADS_CYCLE}" - -elif [[ "${step}" = "eobs" ]]; then - - export MKL_NUM_THREADS=4 - export MKL_CBWR=AUTO - - nth_max=$((npe_node_max / npe_node_eobs)) - - export NTHREADS_GSI=${nth_eobs:-${nth_max}} - [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} - export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_eobs}} --cpus-per-task=${NTHREADS_GSI}" - - export CFP_MP=${CFP_MP:-"YES"} - export USE_CFP=${USE_CFP:-"YES"} - export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" - -elif [[ "${step}" = "eupd" ]]; then - - nth_max=$((npe_node_max / npe_node_eupd)) - - export NTHREADS_ENKF=${nth_eupd:-${nth_max}} - [[ ${NTHREADS_ENKF} -gt ${nth_max} ]] && export NTHREADS_ENKF=${nth_max} - export APRUN_ENKF="${launcher} -n ${npe_enkf:-${npe_eupd}} --cpus-per-task=${NTHREADS_ENKF}" - - export CFP_MP=${CFP_MP:-"YES"} - export 
USE_CFP=${USE_CFP:-"YES"} - export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}" - elif [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then export launcher="srun --mpi=pmi2 -l" @@ -220,99 +48,8 @@ elif [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then export APRUN_UFS="${launcher} -n ${ntasks}" unset nprocs ppn nnodes ntasks - -elif [[ "${step}" = "upp" ]]; then - - nth_max=$((npe_node_max / npe_node_upp)) - - export NTHREADS_UPP=${nth_upp:-1} - [[ ${NTHREADS_UPP} -gt ${nth_max} ]] && export NTHREADS_UPP=${nth_max} - export APRUN_UPP="${launcher} -n ${npe_upp} --cpus-per-task=${NTHREADS_UPP}" - elif [[ "${step}" = "atmos_products" ]]; then export USE_CFP="YES" # Use MPMD for downstream product generation on Hera -elif [[ "${step}" = "oceanice_products" ]]; then - - nth_max=$((npe_node_max / npe_node_oceanice_products)) - - export NTHREADS_OCNICEPOST=${nth_oceanice_products:-1} - export APRUN_OCNICEPOST="${launcher} -n 1 --cpus-per-task=${NTHREADS_OCNICEPOST}" - -elif [[ "${step}" = "ecen" ]]; then - - nth_max=$((npe_node_max / npe_node_ecen)) - - export NTHREADS_ECEN=${nth_ecen:-${nth_max}} - [[ ${NTHREADS_ECEN} -gt ${nth_max} ]] && export NTHREADS_ECEN=${nth_max} - export APRUN_ECEN="${launcher} -n ${npe_ecen} --cpus-per-task=${NTHREADS_ECEN}" - - export NTHREADS_CHGRES=${nth_chgres:-12} - [[ ${NTHREADS_CHGRES} -gt ${npe_node_max} ]] && export NTHREADS_CHGRES=${npe_node_max} - export APRUN_CHGRES="time" - - export NTHREADS_CALCINC=${nth_calcinc:-1} - [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max} - export APRUN_CALCINC="${launcher} -n ${npe_ecen} --cpus-per-task=${NTHREADS_CALCINC}" - -elif [[ "${step}" = "esfc" ]]; then - - nth_max=$((npe_node_max / npe_node_esfc)) - - export NTHREADS_ESFC=${nth_esfc:-${nth_max}} - [[ ${NTHREADS_ESFC} -gt ${nth_max} ]] && export NTHREADS_ESFC=${nth_max} - export APRUN_ESFC="${launcher} -n ${npe_esfc} --cpus-per-task=${NTHREADS_ESFC}" - - export NTHREADS_CYCLE=${nth_cycle:-14} - [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} - export APRUN_CYCLE="${launcher} -n ${npe_esfc} --cpus-per-task=${NTHREADS_CYCLE}" - -elif [[ "${step}" = "epos" ]]; then - - nth_max=$((npe_node_max / npe_node_epos)) - - export NTHREADS_EPOS=${nth_epos:-${nth_max}} - [[ ${NTHREADS_EPOS} -gt ${nth_max} ]] && export NTHREADS_EPOS=${nth_max} - export APRUN_EPOS="${launcher} -n ${npe_epos} --cpus-per-task=${NTHREADS_EPOS}" - -elif [[ "${step}" = "postsnd" ]]; then - - export CFP_MP="YES" - - nth_max=$((npe_node_max / npe_node_postsnd)) - - export NTHREADS_POSTSND=${nth_postsnd:-1} - [[ ${NTHREADS_POSTSND} -gt ${nth_max} ]] && export NTHREADS_POSTSND=${nth_max} - export APRUN_POSTSND="${launcher} -n ${npe_postsnd} --cpus-per-task=${NTHREADS_POSTSND}" - - export NTHREADS_POSTSNDCFP=${nth_postsndcfp:-1} - [[ ${NTHREADS_POSTSNDCFP} -gt ${nth_max} ]] && export NTHREADS_POSTSNDCFP=${nth_max} - export APRUN_POSTSNDCFP="${launcher} -n ${npe_postsndcfp} ${mpmd_opt}" - -elif [[ "${step}" = "awips" ]]; then - - nth_max=$((npe_node_max / npe_node_awips)) - - export NTHREADS_AWIPS=${nth_awips:-2} - [[ ${NTHREADS_AWIPS} -gt ${nth_max} ]] && export NTHREADS_AWIPS=${nth_max} - export APRUN_AWIPSCFP="${launcher} -n ${npe_awips} ${mpmd_opt}" - -elif [[ "${step}" = "gempak" ]]; then - - export CFP_MP="YES" - - nth_max=$((npe_node_max / npe_node_gempak)) - - export NTHREADS_GEMPAK=${nth_gempak:-1} - [[ ${NTHREADS_GEMPAK} -gt ${nth_max} ]] && export NTHREADS_GEMPAK=${nth_max} - -elif [[ "${step}" = "fit2obs" ]]; then - - 
nth_max=$((npe_node_max / npe_node_fit2obs)) - - export NTHREADS_FIT2OBS=${nth_fit2obs:-1} - [[ ${NTHREADS_FIT2OBS} -gt ${nth_max} ]] && export NTHREADS_FIT2OBS=${nth_max} - export MPIRUN="${launcher} -n ${npe_fit2obs} --cpus-per-task=${NTHREADS_FIT2OBS}" - fi diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index 432c3f11d7..acc2dc9cc6 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -75,7 +75,7 @@ case "${machine}" in "hercules") FIX_DIR="/work/noaa/global/kfriedma/glopara/fix" ;; "jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;; "s4") FIX_DIR="/data/prod/glopara/fix" ;; - "gaea") FIX_DIR="/gpfs/f5/epic/proj-shared/global/glopara/data/fix" ;; + "gaea") FIX_DIR="/gpfs/f5/ufs-ard/world-shared/global/glopara/data/fix" ;; "noaacloud") FIX_DIR="/contrib/global-workflow-shared-data/fix" ;; *) echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR" From fe9a4571bf0d2cd8fc47254c9ae094d8d11b37a6 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 31 Jul 2024 15:50:07 +0000 Subject: [PATCH 37/49] Remove ACCOUNT_SERVICE --- workflow/hosts/awspw.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index 81678004dc..f925f54008 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -5,9 +5,8 @@ COMINsyn: '' #TODO: This does not yet exist. HOMEDIR: '/contrib/${USER}' STMP: '/lustre/${USER}/stmp/' PTMP: '/lustre/${USER}/ptmp/' -NOSCRUB: ${HOMEDIR} -ACCOUNT: ${USER} -ACCOUNT_SERVICE: ${USER} +NOSCRUB: '${HOMEDIR}' +ACCOUNT: '${USER}' SCHEDULER: slurm QUEUE: batch QUEUE_SERVICE: batch From 6a4bada18e81f56d7013b3c186fc4a57f6778bed Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 1 Aug 2024 15:00:32 +0000 Subject: [PATCH 38/49] run gefs --- env/AWSPW.env | 20 ++++++++++++++++++++ parm/config/gefs/config.resources | 2 +- sorc/verif-global.fd | 2 +- sorc/wxflow | 2 +- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index 992281a1d7..9b03143f32 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -43,6 +43,17 @@ if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then export APRUN_UFS="${launcher} -n ${ufs_ntasks}" unset nnodes ufs_ntasks +elif [[ "${step}" = "prep_emissions" ]]; then + + export APRUN="${APRUN}" + +elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || [[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostbndpntbll" ]] || [[ "${step}" = "wavepostpnt" ]]; then + + export CFP_MP="YES" + if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi + export wavempexec=${launcher} + export wave_mpmd=${mpmd_opt} + elif [[ "${step}" = "post" ]]; then export NTHREADS_NP=${NTHREADS1} @@ -52,6 +63,15 @@ elif [[ "${step}" = "post" ]]; then [[ ${NTHREADS_DWN} -gt ${max_threads_per_task} ]] && export NTHREADS_DWN=${max_threads_per_task} export APRUN_DWN="${launcher} -n ${ntasks_dwn}" +elif [[ "${step}" = "atmos_products" ]]; then + + export USE_CFP="YES" # Use MPMD for downstream product generation on Hera + +elif [[ "${step}" = "oceanice_products" ]]; then + + export NTHREADS_OCNICEPOST=${NTHREADS1} + export APRUN_OCNICEPOST="${launcher} -n 1 --cpus-per-task=${NTHREADS_OCNICEPOST}" + elif [[ "${step}" = "ecen" ]]; then export NTHREADS_ECEN=${NTHREADSmax} diff --git a/parm/config/gefs/config.resources b/parm/config/gefs/config.resources index 81d2a20635..15e087b52f 100644 --- a/parm/config/gefs/config.resources +++ b/parm/config/gefs/config.resources @@ 
-41,7 +41,7 @@ case ${machine} in ;; "AWSPW") export PARTITION_BATCH="compute" - max_tasks_per_node=40 + max_tasks_per_node=36 ;; *) echo "FATAL ERROR: Unknown machine encountered by ${BASH_SOURCE[0]}" diff --git a/sorc/verif-global.fd b/sorc/verif-global.fd index 92904d2c43..0d9e0b6ab0 160000 --- a/sorc/verif-global.fd +++ b/sorc/verif-global.fd @@ -1 +1 @@ -Subproject commit 92904d2c431969345968f74e676717057ec0042a +Subproject commit 0d9e0b6ab0cabbaccbdfa0868a256065984777ee diff --git a/sorc/wxflow b/sorc/wxflow index d314e06510..e1ef697430 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit d314e065101041a4d45e5a11ec19cd2dc5f38c67 +Subproject commit e1ef697430c09d2b1a0560f21f11c7a32ed5f3e2 From f5a03d47f555dff134ffd6b54d4439475a1b49f0 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 13 Aug 2024 18:45:09 +0000 Subject: [PATCH 39/49] make gefs run on AWS --- parm/config/gefs/config.resources.AWSPW | 11 +++++++++++ parm/config/gfs/config.resources.AWSPW | 1 + scripts/exgfs_wave_post_pnt.sh | 8 ++++---- workflow/rocoto/rocoto.py | 3 ++- 4 files changed, 18 insertions(+), 5 deletions(-) create mode 100644 parm/config/gefs/config.resources.AWSPW diff --git a/parm/config/gefs/config.resources.AWSPW b/parm/config/gefs/config.resources.AWSPW new file mode 100644 index 0000000000..2bb5f35e76 --- /dev/null +++ b/parm/config/gefs/config.resources.AWSPW @@ -0,0 +1,11 @@ +#! /usr/bin/env bash + +# AWS-specific job resources + +export is_exclusive="True" +export memory=None + +# shellcheck disable=SC2312 +for mem_var in $(env | grep '^memory_' | cut -d= -f1); do + unset "${mem_var}" +done diff --git a/parm/config/gfs/config.resources.AWSPW b/parm/config/gfs/config.resources.AWSPW index 8649713bb7..2bb5f35e76 100644 --- a/parm/config/gfs/config.resources.AWSPW +++ b/parm/config/gfs/config.resources.AWSPW @@ -3,6 +3,7 @@ # AWS-specific job resources export is_exclusive="True" +export memory=None # shellcheck disable=SC2312 for mem_var in $(env | grep '^memory_' | cut -d= -f1); do diff --git a/scripts/exgfs_wave_post_pnt.sh b/scripts/exgfs_wave_post_pnt.sh index 0b8874f3fb..5cfff32dba 100755 --- a/scripts/exgfs_wave_post_pnt.sh +++ b/scripts/exgfs_wave_post_pnt.sh @@ -393,14 +393,14 @@ source "${USHgfs}/preamble.sh" then export dtspec=3600. # Construct the wave_outp_spec (spec) command to run on each buoy in buoy_lst.txt - sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_spec.sh \1 ${ymdh} spec ${escaped_SPECDATA} > ${escaped_SPECDATA}\/spec_\1.out 2>\&1/" buoy_lst.txt >> "tmpcmdfile.${FH3}" + sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_spec.sh \1 ${ymdh} spec ${escaped_SPECDATA} > ${escaped_SPECDATA}\/spec_\1.out 2>\&1?" buoy_lst.txt >> "tmpcmdfile.${FH3}" fi if [ "$DOBLL_WAV" = 'YES' ] then export dtspec=3600. # Construct the wave_outp_spec (bull) command to run on each buoy in buoy_lst.txt - sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_spec.sh \1 ${ymdh} bull ${escaped_SPECDATA} > ${escaped_SPECDATA}\/bull_\1.out 2>\&1/" buoy_lst.txt >> "tmpcmdfile.${FH3}" + sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_spec.sh \1 ${ymdh} bull ${escaped_SPECDATA} > ${escaped_SPECDATA}\/bull_\1.out 2>\&1?" 
buoy_lst.txt >> "tmpcmdfile.${FH3}" fi split -n l/1/10 tmpcmdfile.$FH3 > cmdfile.${FH3}.01 @@ -517,13 +517,13 @@ source "${USHgfs}/preamble.sh" if [ "$DOSPC_WAV" = 'YES' ] then # Construct wave_outp_cat (spec) call for each buoy in buoy_lst.txt - sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} spec > ${escaped_CATOUTDIR}\/spec_cat_\1.out 2>\&1/" buoy_lst.txt >> cmdfile.buoy + sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} spec > ${escaped_CATOUTDIR}\/spec_cat_\1.out 2>\&1?" buoy_lst.txt >> cmdfile.buoy fi if [ "$DOBLL_WAV" = 'YES' ] then # Construct wave_outp_cat (bull) call for each buoy in buoy_lst.txt - sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} bull > ${escaped_CATOUTDIR}\/bull_cat_\1.out 2>\&1/" buoy_lst.txt >> cmdfile.buoy + sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} bull > ${escaped_CATOUTDIR}\/bull_cat_\1.out 2>\&1?" buoy_lst.txt >> cmdfile.buoy fi if [ ${CFP_MP:-"NO"} = "YES" ]; then diff --git a/workflow/rocoto/rocoto.py b/workflow/rocoto/rocoto.py index 2a20820da8..a9b443aeda 100644 --- a/workflow/rocoto/rocoto.py +++ b/workflow/rocoto/rocoto.py @@ -146,7 +146,8 @@ def _create_innermost_task(task_dict: Dict[str, Any]) -> List[str]: strings.append(f'\t{walltime}\n') strings.append(f'\t{nodes}:ppn={ppn}:tpp={threads}\n') if memory is not None: - strings.append(f'\t{memory}\n') + if memory != 'None': + strings.append(f'\t{memory}\n') if native is not None: strings.append(f'\t{native}\n') strings.append('\n') From e606fc7bfef0c45e1b107fd0c81a038c1872183c Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 13 Aug 2024 22:11:29 +0000 Subject: [PATCH 40/49] test in gefs C48 case --- workflow/rocoto/workflow_xml.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index 8c859db25e..c049b32d20 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -156,9 +156,9 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: replyto = '' strings = ['', - f'#################### {pslot} ####################', - f'MAILTO="{replyto}"' - ] + f'#################### {pslot} ####################', + f'MAILTO="{replyto}"' + ] #AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. 
if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']: strings.extend( @@ -166,11 +166,10 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: f'SHELL="/bin/bash"', f'BASH_ENV="/etc/bashrc"' ]) - strings.extend( - [ - f'{cronintstr} {rocotorunstr}', - '#################################################################', - '']) + strings.extend([ + f'{cronintstr} {rocotorunstr}', + '#################################################################', + '']) if crontab_file is None: crontab_file = f"{expdir}/{pslot}.crontab" From 8b3ad614cfb7f9d22608e0fbb9e96c92c891981d Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 13 Aug 2024 22:17:41 +0000 Subject: [PATCH 41/49] fix pynorms error --- workflow/rocoto/workflow_xml.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index c049b32d20..b697e349a9 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -159,13 +159,12 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: f'#################### {pslot} ####################', f'MAILTO="{replyto}"' ] - #AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. + # AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']: - strings.extend( - [ - f'SHELL="/bin/bash"', - f'BASH_ENV="/etc/bashrc"' - ]) + strings.extend([ + f'SHELL="/bin/bash"', + f'BASH_ENV="/etc/bashrc"' + ]) strings.extend([ f'{cronintstr} {rocotorunstr}', '#################################################################', From 73050a87433e009e4cc2950663207580afe65cd7 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 13 Aug 2024 22:24:02 +0000 Subject: [PATCH 42/49] fix pynorms error --- workflow/rocoto/workflow_xml.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index b697e349a9..d9ca4fb961 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -161,12 +161,9 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: ] # AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start. 
if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']: - strings.extend([ - f'SHELL="/bin/bash"', - f'BASH_ENV="/etc/bashrc"' - ]) - strings.extend([ - f'{cronintstr} {rocotorunstr}', + strings.extend([f'SHELL="/bin/bash"', + f'BASH_ENV="/etc/bashrc"']) + strings.extend([f'{cronintstr} {rocotorunstr}', '#################################################################', '']) From 10d4104013a94aa4da4833f0c3f8b6b4c60a063f Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 14 Aug 2024 19:58:00 +0000 Subject: [PATCH 43/49] use 'unset memory' in resource.AWSPW, instead of change it in rocoto.py --- parm/config/gefs/config.resources.AWSPW | 2 +- parm/config/gfs/config.resources.AWSPW | 2 +- workflow/rocoto/rocoto.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/parm/config/gefs/config.resources.AWSPW b/parm/config/gefs/config.resources.AWSPW index 2bb5f35e76..a735c7622d 100644 --- a/parm/config/gefs/config.resources.AWSPW +++ b/parm/config/gefs/config.resources.AWSPW @@ -3,7 +3,7 @@ # AWS-specific job resources export is_exclusive="True" -export memory=None +unset memory # shellcheck disable=SC2312 for mem_var in $(env | grep '^memory_' | cut -d= -f1); do diff --git a/parm/config/gfs/config.resources.AWSPW b/parm/config/gfs/config.resources.AWSPW index 2bb5f35e76..a735c7622d 100644 --- a/parm/config/gfs/config.resources.AWSPW +++ b/parm/config/gfs/config.resources.AWSPW @@ -3,7 +3,7 @@ # AWS-specific job resources export is_exclusive="True" -export memory=None +unset memory # shellcheck disable=SC2312 for mem_var in $(env | grep '^memory_' | cut -d= -f1); do diff --git a/workflow/rocoto/rocoto.py b/workflow/rocoto/rocoto.py index a9b443aeda..2a20820da8 100644 --- a/workflow/rocoto/rocoto.py +++ b/workflow/rocoto/rocoto.py @@ -146,8 +146,7 @@ def _create_innermost_task(task_dict: Dict[str, Any]) -> List[str]: strings.append(f'\t{walltime}\n') strings.append(f'\t{nodes}:ppn={ppn}:tpp={threads}\n') if memory is not None: - if memory != 'None': - strings.append(f'\t{memory}\n') + strings.append(f'\t{memory}\n') if native is not None: strings.append(f'\t{native}\n') strings.append('\n') From b3de7c1c2c5392d3c3f5a0863123701d661b2f89 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 14 Aug 2024 21:06:52 +0000 Subject: [PATCH 44/49] revert hash --- sorc/verif-global.fd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/verif-global.fd b/sorc/verif-global.fd index 0d9e0b6ab0..92904d2c43 160000 --- a/sorc/verif-global.fd +++ b/sorc/verif-global.fd @@ -1 +1 @@ -Subproject commit 0d9e0b6ab0cabbaccbdfa0868a256065984777ee +Subproject commit 92904d2c431969345968f74e676717057ec0042a From 16770f87e1b38e4492f69649191ccb47b270d52c Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 14 Aug 2024 21:14:53 +0000 Subject: [PATCH 45/49] remove export APRUN="${APRUN}" --- env/AWSPW.env | 4 ---- 1 file changed, 4 deletions(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index 9b03143f32..a7ba9bb1c5 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -43,10 +43,6 @@ if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then export APRUN_UFS="${launcher} -n ${ufs_ntasks}" unset nnodes ufs_ntasks -elif [[ "${step}" = "prep_emissions" ]]; then - - export APRUN="${APRUN}" - elif [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || [[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostbndpntbll" ]] || [[ "${step}" = "wavepostpnt" ]]; then export CFP_MP="YES" From b9416fa5c3960ca43b68c24c31b32327944f7d4e Mon Sep 17 
00:00:00 2001 From: Wei Huang Date: Thu, 15 Aug 2024 20:47:25 +0000 Subject: [PATCH 46/49] revert sed separator change for now --- env/AWSPW.env | 2 +- scripts/exgfs_wave_post_pnt.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/env/AWSPW.env b/env/AWSPW.env index a7ba9bb1c5..7fe17d2492 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -27,7 +27,7 @@ if [[ -n "${ntasks:-}" && -n "${max_tasks_per_node:-}" && -n "${tasks_per_node:- NTHREADS1=${threads_per_task:-1} [[ ${NTHREADSmax} -gt ${max_threads_per_task} ]] && NTHREADSmax=${max_threads_per_task} [[ ${NTHREADS1} -gt ${max_threads_per_task} ]] && NTHREADS1=${max_threads_per_task} - APRUN="${launcher} -n ${ntasks}" + export APRUN="${launcher} -n ${ntasks}" else echo "ERROR config.resources must be sourced before sourcing AWSPW.env" exit 2 diff --git a/scripts/exgfs_wave_post_pnt.sh b/scripts/exgfs_wave_post_pnt.sh index 5cfff32dba..e592b9b7cb 100755 --- a/scripts/exgfs_wave_post_pnt.sh +++ b/scripts/exgfs_wave_post_pnt.sh @@ -517,13 +517,13 @@ source "${USHgfs}/preamble.sh" if [ "$DOSPC_WAV" = 'YES' ] then # Construct wave_outp_cat (spec) call for each buoy in buoy_lst.txt - sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} spec > ${escaped_CATOUTDIR}\/spec_cat_\1.out 2>\&1?" buoy_lst.txt >> cmdfile.buoy + sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} spec > ${escaped_CATOUTDIR}\/spec_cat_\1.out 2>\&1/" buoy_lst.txt >> cmdfile.buoy fi if [ "$DOBLL_WAV" = 'YES' ] then # Construct wave_outp_cat (bull) call for each buoy in buoy_lst.txt - sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} bull > ${escaped_CATOUTDIR}\/bull_cat_\1.out 2>\&1?" buoy_lst.txt >> cmdfile.buoy + sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_cat.sh \1 ${FHMAX_WAV_PNT} bull > ${escaped_CATOUTDIR}\/bull_cat_\1.out 2>\&1/" buoy_lst.txt >> cmdfile.buoy fi if [ ${CFP_MP:-"NO"} = "YES" ]; then From 456fa0cdefe5ef72e5be2eefaa3b7593111c1718 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Thu, 15 Aug 2024 21:02:34 +0000 Subject: [PATCH 47/49] revert sed separator change for now, more revert needed --- scripts/exgfs_wave_post_pnt.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/exgfs_wave_post_pnt.sh b/scripts/exgfs_wave_post_pnt.sh index e592b9b7cb..0b8874f3fb 100755 --- a/scripts/exgfs_wave_post_pnt.sh +++ b/scripts/exgfs_wave_post_pnt.sh @@ -393,14 +393,14 @@ source "${USHgfs}/preamble.sh" then export dtspec=3600. # Construct the wave_outp_spec (spec) command to run on each buoy in buoy_lst.txt - sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_spec.sh \1 ${ymdh} spec ${escaped_SPECDATA} > ${escaped_SPECDATA}\/spec_\1.out 2>\&1?" buoy_lst.txt >> "tmpcmdfile.${FH3}" + sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_spec.sh \1 ${ymdh} spec ${escaped_SPECDATA} > ${escaped_SPECDATA}\/spec_\1.out 2>\&1/" buoy_lst.txt >> "tmpcmdfile.${FH3}" fi if [ "$DOBLL_WAV" = 'YES' ] then export dtspec=3600. # Construct the wave_outp_spec (bull) command to run on each buoy in buoy_lst.txt - sed "s?^\(.*\)$/${escaped_USHgfs}?wave_outp_spec.sh \1 ${ymdh} bull ${escaped_SPECDATA} > ${escaped_SPECDATA}\/bull_\1.out 2>\&1?" 
buoy_lst.txt >> "tmpcmdfile.${FH3}" + sed "s/^\(.*\)$/${escaped_USHgfs}\/wave_outp_spec.sh \1 ${ymdh} bull ${escaped_SPECDATA} > ${escaped_SPECDATA}\/bull_\1.out 2>\&1/" buoy_lst.txt >> "tmpcmdfile.${FH3}" fi split -n l/1/10 tmpcmdfile.$FH3 > cmdfile.${FH3}.01 From 35ae83ec94d3cf4f208fdfa6cc5a0e93687673ae Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Wed, 21 Aug 2024 04:41:23 +0000 Subject: [PATCH 48/49] turn off wave on AWS --- parm/config/gefs/config.base | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/parm/config/gefs/config.base b/parm/config/gefs/config.base index fad9e3421a..44f31e58c3 100644 --- a/parm/config/gefs/config.base +++ b/parm/config/gefs/config.base @@ -346,4 +346,10 @@ export DELETE_COM_IN_ARCHIVE_JOB="YES" # NO=retain ROTDIR. YES default in arc # Number of regional collectives to create soundings for export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9} +# The tracker, genesis, and METplus jobs are not supported on AWS yet +# TODO: we should place these in workflow/hosts/awspw.yaml as part of AWS setup, not for general. +if [[ "${machine}" == "AWSPW" ]]; then + export DO_WAVE="NO" +fi + echo "END: config.base" From 761a168d3e6008ff33b4e4cb7d985774a8b425b9 Mon Sep 17 00:00:00 2001 From: Wei Huang Date: Tue, 10 Sep 2024 14:43:23 +0000 Subject: [PATCH 49/49] correct a shell error --- parm/config/gefs/config.resources | 1 + 1 file changed, 1 insertion(+) diff --git a/parm/config/gefs/config.resources b/parm/config/gefs/config.resources index 323d547a34..a96bb02bd9 100644 --- a/parm/config/gefs/config.resources +++ b/parm/config/gefs/config.resources @@ -46,6 +46,7 @@ case ${machine} in "AZUREPW") export PARTITION_BATCH="compute" max_tasks_per_node=24 + ;; "GOOGLEPW") export PARTITION_BATCH="compute" max_tasks_per_node=32
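
The final hunk above ("correct a shell error") adds the `;;` missing at the end of the AZUREPW arm of the `case` block in parm/config/gefs/config.resources; without a terminator, bash typically aborts at the next pattern with a syntax error near the unexpected `)` as soon as the file is sourced. A minimal, illustrative sketch of the corrected construct follows (machine names and values are copied from the diffs; the trailing echo is only for demonstration):

#!/usr/bin/env bash
# Illustrative only: every case arm must be closed with ";;" (or ";&"/";;&"),
# otherwise sourcing the resources file fails before any settings are applied.
machine="${machine:-AZUREPW}"
case ${machine} in
  "AZUREPW")
    export PARTITION_BATCH="compute"
    max_tasks_per_node=24
    ;;
  "GOOGLEPW")
    export PARTITION_BATCH="compute"
    max_tasks_per_node=32
    ;;
  *)
    echo "FATAL ERROR: Unknown machine encountered by ${BASH_SOURCE[0]}"
    ;;
esac
echo "${machine}: PARTITION_BATCH=${PARTITION_BATCH} max_tasks_per_node=${max_tasks_per_node}"
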