From 9dfe6b0ce95d347ef7a256ae89b0454af0609ccb Mon Sep 17 00:00:00 2001 From: Andrew Shao Date: Thu, 24 Jun 2021 15:21:17 -0600 Subject: [PATCH 01/15] Modify launch_database_cluster to run in CIME To ensure that the vanilla OpenMPI launcher is used to launch the database, the database cluster launcher script needs to be wrapped within a bash script. This also comments out much of the code within the launching script so that the database will live for the duration of the allocation. --- tutorials/cheyenne/cime_database.sh | 8 ++++ tutorials/cheyenne/launch_database_cluster.py | 38 +++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) create mode 100644 tutorials/cheyenne/cime_database.sh diff --git a/tutorials/cheyenne/cime_database.sh b/tutorials/cheyenne/cime_database.sh new file mode 100644 index 000000000..8299c9464 --- /dev/null +++ b/tutorials/cheyenne/cime_database.sh @@ -0,0 +1,8 @@ +#!/bin/bash +module purge +# Fill out the required +module load openmpi + +load_conda +conda activate smartsim-test +launch_database_cluster.py diff --git a/tutorials/cheyenne/launch_database_cluster.py b/tutorials/cheyenne/launch_database_cluster.py index f65daf253..c8297ca21 100644 --- a/tutorials/cheyenne/launch_database_cluster.py +++ b/tutorials/cheyenne/launch_database_cluster.py @@ -72,24 +72,24 @@ def launch_cluster_orc(exp, db_hosts, port): db = launch_cluster_orc(exp, db_hosts, db_port) -# test sending some arrays to the database cluster -# the following functions are largely the same across all the -# client languages: C++, C, Fortran, Python - -# only need one address of one shard of DB to connect client -db_address = ":".join((db_hosts[0], str(db_port))) -client = Client(address=db_address, cluster=True) - -# put into database -test_array = np.array([1,2,3,4]) -print(f"Array put in database: {test_array}") -client.put_tensor("test", test_array) - -# get from database -returned_array = client.get_tensor("test") -print(f"Array retrieved from database: 
{returned_array}") - -# shutdown the database because we don't need it anymore -exp.stop(db) +## test sending some arrays to the database cluster +## the following functions are largely the same across all the +## client languages: C++, C, Fortran, Python +# +## only need one address of one shard of DB to connect client +#db_address = ":".join((db_hosts[0], str(db_port))) +#client = Client(address=db_address, cluster=True) +# +## put into database +#test_array = np.array([1,2,3,4]) +#print(f"Array put in database: {test_array}") +#client.put_tensor("test", test_array) +# +## get from database +#returned_array = client.get_tensor("test") +#print(f"Array retrieved from database: {returned_array}") +# +## shutdown the database because we don't need it anymore +#exp.stop(db) From be52bebe1d1f946e69b0377010de1c73e64d4303 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Tue, 6 Jul 2021 10:56:29 -0600 Subject: [PATCH 02/15] add fortran hello --- tutorials/cheyenne/Makefile | 18 +++++++++++ tutorials/cheyenne/hello.F90 | 19 ++++++++++++ tutorials/cheyenne/launch_database_cluster.py | 30 +++++++++++++++---- 3 files changed, 62 insertions(+), 5 deletions(-) create mode 100644 tutorials/cheyenne/Makefile create mode 100644 tutorials/cheyenne/hello.F90 diff --git a/tutorials/cheyenne/Makefile b/tutorials/cheyenne/Makefile new file mode 100644 index 000000000..f85cb511d --- /dev/null +++ b/tutorials/cheyenne/Makefile @@ -0,0 +1,18 @@ +REDIS_HOME = /glade/u/home/jedwards/sandboxes/cesm2_x_alpha/components/SmartSim/SmartRedis +REDIS_SRC = $(REDIS_HOME)/src/fortran/client.F90 \ + $(REDIS_HOME)/src/fortran/dataset.F90 \ + $(REDIS_HOME)/src/fortran/fortran_c_interop.F90 + +REDIS_OBJ = client.o dataset.o fortran_c_interop.o +MPIFC = mpif90 + + +hello: hello.F90 $(REDIS_OBJ) + $(MPIFC) $< -o $@ $(REDIS_OBJ) -L$(REDIS_HOME)/install/lib -lsmartredis -Wl,-rpath $(REDIS_HOME)/install/lib + +%.o : $(REDIS_HOME)/src/fortran/%.F90 + $(MPIFC) $< -c -o $@ -I $(REDIS_HOME)/install/include + + 
+client.o: dataset.o +dataset.o: fortran_c_interop.o diff --git a/tutorials/cheyenne/hello.F90 b/tutorials/cheyenne/hello.F90 new file mode 100644 index 000000000..cda0515ed --- /dev/null +++ b/tutorials/cheyenne/hello.F90 @@ -0,0 +1,19 @@ +! Fortran example + program hello + use MPI + use iso_c_binding + use smartredis_client, only : client_type + + integer :: rank, size, ierror, tag, status(MPI_STATUS_SIZE) + type(client_type) :: client + + call MPI_INIT(ierror) + call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierror) + call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror) + + call client%initialize(.true.) + + print*, 'node', rank, ': Hello world' + call MPI_FINALIZE(ierror) + + end program diff --git a/tutorials/cheyenne/launch_database_cluster.py b/tutorials/cheyenne/launch_database_cluster.py index c8297ca21..f57892013 100644 --- a/tutorials/cheyenne/launch_database_cluster.py +++ b/tutorials/cheyenne/launch_database_cluster.py @@ -1,4 +1,17 @@ +#!/usr/bin/env python3 +#PBS -N smartsimtest +#PBS -r n +#PBS -j oe +#PBS -V +#PBS -l walltime=00:10:00 +#PBS -A P93300606 +#PBS -q regular +#PBS -V +#PBS -S /bin/bash +#PBS -l select=4:ncpus=36:mpiprocs=36:ompthreads=1:nodetype=largemem + import os +import socket import numpy as np from smartsim import Experiment, constants @@ -37,8 +50,13 @@ def collect_db_hosts(num_hosts): if len(hosts) >= num_hosts: return hosts[:num_hosts] + with open(os.path.basename(node_file), "w") as f: + for line in hosts[num_hosts:]: + f.write(line) + print("host is {}".format(line)) + os.environ["PBS_NODEFILE"] = os.path.basename(node_file) else: - raise Exception(f"PBS_NODEFILE had {len(hosts)} hosts, not {num_hosts}") + raise Exception("PBS_NODEFILE {} had {} hosts, not {}".format(node_file, len(hosts),num_hosts)) def launch_cluster_orc(exp, db_hosts, port): @@ -63,6 +81,8 @@ def launch_cluster_orc(exp, db_hosts, port): return db +print("before PBS_NODEFILE is {}".format(os.getenv("PBS_NODEFILE"))) + # create the experiment and specify PBS because 
cheyenne is a PBS system exp = Experiment("launch_cluster_db", launcher="pbs") @@ -77,7 +97,9 @@ def launch_cluster_orc(exp, db_hosts, port): ## client languages: C++, C, Fortran, Python # ## only need one address of one shard of DB to connect client -#db_address = ":".join((db_hosts[0], str(db_port))) +db_address = ":".join((socket.gethostbyname(db_hosts[0]), str(db_port))) +print("db_address is {}".format(db_address)) +print("after PBS_NODEFILE is {}".format(os.getenv("PBS_NODEFILE"))) #client = Client(address=db_address, cluster=True) # ## put into database @@ -90,6 +112,4 @@ def launch_cluster_orc(exp, db_hosts, port): #print(f"Array retrieved from database: {returned_array}") # ## shutdown the database because we don't need it anymore -#exp.stop(db) - - +exp.stop(db) From 9c940a5160a0977d731743a78428aac88f36400c Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 08:21:08 -0600 Subject: [PATCH 03/15] add casper tutorial --- tutorials/casper/Makefile | 17 +++ tutorials/casper/launch_database_cluster.py | 116 ++++++++++++++++++ tutorials/casper/resv_gpu_job.cmd | 55 +++++++++ tutorials/casper/resv_job.cmd | 75 +++++++++++ tutorials/casper/smartredis_put_get_3D.F90 | 40 ++++++ tutorials/cheyenne/launch_database_cluster.py | 32 +++-- 6 files changed, 324 insertions(+), 11 deletions(-) create mode 100644 tutorials/casper/Makefile create mode 100755 tutorials/casper/launch_database_cluster.py create mode 100644 tutorials/casper/resv_gpu_job.cmd create mode 100644 tutorials/casper/resv_job.cmd create mode 100644 tutorials/casper/smartredis_put_get_3D.F90 diff --git a/tutorials/casper/Makefile b/tutorials/casper/Makefile new file mode 100644 index 000000000..bf7d7ee7b --- /dev/null +++ b/tutorials/casper/Makefile @@ -0,0 +1,17 @@ +REDIS_HOME = $(HOME)/sandboxes/SmartRedis +REDIS_SRC = $(REDIS_HOME)/src/fortran/client.F90 \ + $(REDIS_HOME)/src/fortran/dataset.F90 \ + $(REDIS_HOME)/src/fortran/fortran_c_interop.F90 + +REDIS_OBJ = client.o dataset.o 
fortran_c_interop.o +MPIFC = mpif90 + +smartredis_put_get_3D: smartredis_put_get_3D.F90 $(REDIS_OBJ) + $(MPIFC) $< -o $@ $(REDIS_OBJ) -L$(REDIS_HOME)/install/lib -lsmartredis -Wl,-rpath $(REDIS_HOME)/install/lib + +%.o : $(REDIS_HOME)/src/fortran/%.F90 + $(MPIFC) $< -c -o $@ -I $(REDIS_HOME)/install/include + + +client.o: dataset.o +dataset.o: fortran_c_interop.o diff --git a/tutorials/casper/launch_database_cluster.py b/tutorials/casper/launch_database_cluster.py new file mode 100755 index 000000000..f0c4db017 --- /dev/null +++ b/tutorials/casper/launch_database_cluster.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +#PBS -N smartsimtest +#PBS -r n +#PBS -j oe +#PBS -V +#PBS -l walltime=00:20:00 +#PBS -A P93300606 +##PBS -q regular +#PBS -V +#PBS -l select=1:ncpus=1:ompthreads=1:mpiprocs=1 + +import os, sys, time +cesmroot = os.environ.get('CESM_ROOT') +if cesmroot is None: + raise SystemExit("ERROR: CESM_ROOT must be defined in environment") + +_LIBDIR = os.path.join(cesmroot,"cime","scripts","Tools") +sys.path.append(_LIBDIR) +_LIBDIR = os.path.join(cesmroot,"cime","scripts","lib") +sys.path.append(_LIBDIR) + +import socket +import numpy as np + +from smartsim import Experiment, constants +from smartsim.database import PBSOrchestrator + +from smartredis import Client +from CIME.utils import run_cmd + +""" +Launch a distributed, in memory database cluster and use the +SmartRedis python client to send and recieve some numpy arrays. + +This example runs in an interactive allocation with at least three +nodes and 1 processor per node. + +i.e. qsub -l select=3:ncpus=1 -l walltime=00:10:00 -A -q premium -I +""" + +def collect_db_hosts(num_hosts): + """A simple method to collect hostnames because we are using + openmpi. (not needed for aprun(ALPS), Slurm, etc. 
+ """ + + hosts = [] + if "PBS_NODEFILE" in os.environ: + node_file = os.environ["PBS_NODEFILE"] + with open(node_file, "r") as f: + for line in f.readlines(): + host = line.split(".")[0] + hosts.append(host) + else: + raise Exception("could not parse interactive allocation nodes from PBS_NODEFILE") + + # account for mpiprocs causing repeats in PBS_NODEFILE + hosts = list(set(hosts)) + if len(hosts) >= num_hosts: + return hosts[:num_hosts] + else: + raise Exception("PBS_NODEFILE {} had {} hosts, not {}".format(node_file, len(hosts),num_hosts)) + + +def launch_cluster_orc(exp, db_hosts, port): + """Just spin up a database cluster, check the status + and tear it down""" + + print(f"Starting Orchestrator on hosts: {db_hosts}") + # batch = False to launch on existing allocation + db = PBSOrchestrator(port=port, db_nodes=len(db_hosts), batch=False, + run_command="mpirun", hosts=db_hosts) + + # generate directories for output files + # pass in objects to make dirs for + exp.generate(db, overwrite=True) + + # start the database on interactive allocation + exp.start(db, block=True) + + # get the status of the database + statuses = exp.get_status(db) + print(f"Status of all database nodes: {statuses}") + + return db + +# create the experiment and specify PBS because cheyenne is a PBS system +exp = Experiment("launch_cluster_db", launcher="pbs") + +db_port = 6780 +db_hosts = collect_db_hosts(1) +# start the database +db = launch_cluster_orc(exp, db_hosts, db_port) + +## test sending some arrays to the database cluster +## the following functions are largely the same across all the +## client languages: C++, C, Fortran, Python +# +## only need one address of one shard of DB to connect client +#db_address = ":".join((socket.gethostbyname(db_hosts[0]), str(db_port))) + +#s, o, e = run_cmd("mpirun -n 36 --hostfile {} ./hello".format(new_host_file), verbose=True) +#print("After hello {} {} {} ".format(s,o,e)) +#client = Client(address=db_address, cluster=True) +# +## put into 
database +#test_array = np.array([1,2,3,4]) +#print(f"Array put in database: {test_array}") +#client.put_tensor("test", test_array) +# +## get from database +#returned_array = client.get_tensor("test") +#print(f"Array retrieved from database: {returned_array}") +# +## shutdown the database because we don't need it anymore +time.sleep(1200) +exp.stop(db) diff --git a/tutorials/casper/resv_gpu_job.cmd b/tutorials/casper/resv_gpu_job.cmd new file mode 100644 index 000000000..380b86e3f --- /dev/null +++ b/tutorials/casper/resv_gpu_job.cmd @@ -0,0 +1,55 @@ +#!/bin/bash -x +#PBS -N resv_job +#PBS -l select=1:ncpus=36:mpiprocs=36+1:ncpus=1:mpiprocs=1:ngpus=2 +#PBS -l gpu_type=v100 +#PBS -l walltime=00:30:00 +#PBS -W create_resv_from_job=true +#PBS -j oe +#PBS -k oed +#PBS -q casper +#PBS -A P93300606 + +for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}') +do + parent_job=$(pbs_rstat -F $rsv|awk '$1 ~ /^reserve_job/{print $3}') + if [[ "${PBS_JOBID}" == "${parent_job}" ]] ; then + rsvname=$rsv + break + fi +done +if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is set to '$rsvname'"; fi + +me=$(whoami) +pbs_ralter -U $me $rsvname +export CESM_ROOT=/glade/work/jedwards/sandboxes/cesm2_x_alpha.smartsim/ +gpu_jobid=$(qsub -q $rsvname -v CESM_ROOT launch_database_cluster.py) + +head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) +SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):6780" +export SSDB +#./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force +#./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test +#./case.submit +qsub -l walltime=00:20:00 -AP93300606 -q $rsvname -v SSDB ./smartredis_put_get_3D +# clean up + +# +#pbs_rdel $rsvname +cat < cleanup.cmd +#!/bin/bash +#PBS -N cleanup +#PBS -l select=1:ncpus=1:mpiprocs=1 +#PBS -l walltime=00:30:00 +#PBS -j oe +#PBS -k oed +#PBS -A P93300606 +do + running=\$(qstat -Q $rsvname | awk '\$6 ~/[0-9]+/print {\$6, \$7}'} + if [[ "\$running" == "0 0" ]] ; then 
+ pbs_rdel $rsvname + break + fi + sleep 10 +done +EOF1 +qsub -q casper ./cleanup.cmd diff --git a/tutorials/casper/resv_job.cmd b/tutorials/casper/resv_job.cmd new file mode 100644 index 000000000..32d958135 --- /dev/null +++ b/tutorials/casper/resv_job.cmd @@ -0,0 +1,75 @@ +#!/bin/bash -x +#PBS -N resv_job +#PBS -l select=1:ncpus=4:mpiprocs=4+1:ncpus=1:mpiprocs=1 +##PBS -l gpu_type=v100 +#PBS -l walltime=00:30:00 +#PBS -W create_resv_from_job=true +#PBS -j oe +#PBS -k oed +#PBS -q casper +#PBS -A P93300606 + +for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}') +do + parent_job=$(pbs_rstat -F $rsv|awk '$1 ~ /^reserve_job/{print $3}') + if [[ "${PBS_JOBID}" == "${parent_job}" ]] ; then + rsvname=$rsv + break + fi +done +if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is set to '$rsvname'"; fi + +me=$(whoami) +pbs_ralter -U $me $rsvname +export CESM_ROOT=/glade/work/jedwards/sandboxes/cesm2_x_alpha.smartsim/ +db_jobid=$(qsub -q $rsvname -v CESM_ROOT launch_database_cluster.py) + +head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) +# This gets the ib network +#SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):6780" +# This gets the external network +SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):6780" +export SSDB +#./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force +#./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test +#./case.submit + +cat < sr.cmd +#!/bin/bash +#PBS -N smartredis_job +#PBS -l select=1:ncpus=4:mpiprocs=4 +#PBS -l walltime=00:20:00 +#PBS -j oe +#PBS -k oed +#PBS -A P93300606 + +mpirun -np 4 ./smartredis_put_get_3D + +sleep 10 +EOF + +qsub -l walltime=00:20:00 -AP93300606 -q $rsvname -v SSDB ./sr.cmd +# clean up + +# +#pbs_rdel $rsvname +cat < cleanup.cmd +#!/bin/bash +#PBS -N cleanup +#PBS -l select=1:ncpus=1:mpiprocs=1 +#PBS -l walltime=00:30:00 +#PBS -j oe +#PBS -k oed +#PBS -A P93300606 + +for i in \`seq 1 100\`; +do + running=\$(qstat -Q $rsvname | awk 
'\$6 ~/[0-9]+/ {print \$6, \$7}') + if [[ "\$running" == "0 0" ]] ; then + pbs_rdel $rsvname + break + fi + sleep 10 +done +EOF1 +qsub -q casper ./cleanup.cmd diff --git a/tutorials/casper/smartredis_put_get_3D.F90 b/tutorials/casper/smartredis_put_get_3D.F90 new file mode 100644 index 000000000..199d1eae7 --- /dev/null +++ b/tutorials/casper/smartredis_put_get_3D.F90 @@ -0,0 +1,40 @@ +program main + + use mpi + use iso_c_binding + use smartredis_client, only : client_type + + implicit none + + integer, parameter :: dim1 = 10 + integer, parameter :: dim2 = 20 + integer, parameter :: dim3 = 30 + + real(kind=8), dimension(dim1, dim2, dim3) :: recv_array_real_64 + + real(kind=c_double), dimension(dim1, dim2, dim3) :: true_array_real_64 + + integer :: i, j, k + type(client_type) :: client + + integer :: err_code, pe_id + character(len=9) :: key_prefix + + call MPI_init( err_code ) + call MPI_comm_rank( MPI_COMM_WORLD, pe_id, err_code) + write(key_prefix, "(A,I6.6)") "pe_",pe_id + + call random_number(true_array_real_64) + + call random_number(recv_array_real_64) + + call client%initialize(.false.) + + call client%put_tensor(key_prefix//"true_array_real_64", true_array_real_64, shape(true_array_real_64)) + call client%unpack_tensor(key_prefix//"true_array_real_64", recv_array_real_64, shape(recv_array_real_64)) + if (.not. all(true_array_real_64 == recv_array_real_64)) stop 'true_array_real_64: FAILED' + + call mpi_finalize(err_code) + if (pe_id == 0) write(*,*) "SmartRedis MPI Fortran example 3D put/get finished." 
+ +end program main diff --git a/tutorials/cheyenne/launch_database_cluster.py b/tutorials/cheyenne/launch_database_cluster.py index f57892013..712716d1f 100644 --- a/tutorials/cheyenne/launch_database_cluster.py +++ b/tutorials/cheyenne/launch_database_cluster.py @@ -8,9 +8,17 @@ #PBS -q regular #PBS -V #PBS -S /bin/bash -#PBS -l select=4:ncpus=36:mpiprocs=36:ompthreads=1:nodetype=largemem +#PBS -l select=4:ncpus=36:mpiprocs=36:ompthreads=1 +import os, sys +cesmroot = os.environ.get('CESM_ROOT') +if cesmroot is None: + raise SystemExit("ERROR: CESM_ROOT must be defined in environment") + +_LIBDIR = os.path.join(cesmroot,"cime","scripts","Tools") +sys.path.append(_LIBDIR) +_LIBDIR = os.path.join(cesmroot,"cime","scripts","lib") +sys.path.append(_LIBDIR) -import os import socket import numpy as np @@ -18,7 +26,7 @@ from smartsim.database import PBSOrchestrator from smartredis import Client - +from CIME.utils import run_cmd """ Launch a distributed, in memory database cluster and use the @@ -47,14 +55,12 @@ def collect_db_hosts(num_hosts): # account for mpiprocs causing repeats in PBS_NODEFILE hosts = list(set(hosts)) - if len(hosts) >= num_hosts: - return hosts[:num_hosts] - with open(os.path.basename(node_file), "w") as f: + new_host_file = os.path.basename(node_file) + with open(new_host_file, "w") as f: for line in hosts[num_hosts:]: - f.write(line) - print("host is {}".format(line)) - os.environ["PBS_NODEFILE"] = os.path.basename(node_file) + f.write(line+".ib0.cheyenne.ucar.edu\n") + return hosts[:num_hosts], new_host_file else: raise Exception("PBS_NODEFILE {} had {} hosts, not {}".format(node_file, len(hosts),num_hosts)) @@ -87,11 +93,10 @@ def launch_cluster_orc(exp, db_hosts, port): exp = Experiment("launch_cluster_db", launcher="pbs") db_port = 6780 -db_hosts = collect_db_hosts(3) +db_hosts, new_host_file = collect_db_hosts(3) # start the database db = launch_cluster_orc(exp, db_hosts, db_port) - ## test sending some arrays to the database cluster ## the 
following functions are largely the same across all the ## client languages: C++, C, Fortran, Python @@ -99,7 +104,12 @@ def launch_cluster_orc(exp, db_hosts, port): ## only need one address of one shard of DB to connect client db_address = ":".join((socket.gethostbyname(db_hosts[0]), str(db_port))) print("db_address is {}".format(db_address)) +os.environ["SSDB"] = db_address +#os.environ["PBS_NODEFILE"] = new_host_file print("after PBS_NODEFILE is {}".format(os.getenv("PBS_NODEFILE"))) + +s, o, e = run_cmd("mpirun -n 36 --hostfile {} ./hello".format(new_host_file), verbose=True) +print("After hello {} {} {} ".format(s,o,e)) #client = Client(address=db_address, cluster=True) # ## put into database From f64b1e4812d9ad887e4d606c6587c0ca01ffecb0 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 16:21:58 -0600 Subject: [PATCH 04/15] casper example --- tutorials/casper/cleanup.template | 9 +++ tutorials/casper/launch.py | 69 +++++++++++++++++ tutorials/casper/launch_client.template | 10 +++ ...er.py => launch_database_cluster.template} | 22 ++---- tutorials/casper/resv_job.cmd | 75 ------------------- .../{resv_gpu_job.cmd => resv_job.template} | 40 ++++------ 6 files changed, 111 insertions(+), 114 deletions(-) create mode 100644 tutorials/casper/cleanup.template create mode 100755 tutorials/casper/launch.py create mode 100644 tutorials/casper/launch_client.template rename tutorials/casper/{launch_database_cluster.py => launch_database_cluster.template} (86%) delete mode 100644 tutorials/casper/resv_job.cmd rename tutorials/casper/{resv_gpu_job.cmd => resv_job.template} (50%) diff --git a/tutorials/casper/cleanup.template b/tutorials/casper/cleanup.template new file mode 100644 index 000000000..d304cfd91 --- /dev/null +++ b/tutorials/casper/cleanup.template @@ -0,0 +1,9 @@ +#!/bin/bash +#PBS -N cleanup +#PBS -l select=1:ncpus=1:mpiprocs=1 +#PBS -l walltime=00:30:00 +#PBS -j oe +#PBS -k oed +#PBS -A $account + +pbs_rdel $RSVNAME diff --git 
a/tutorials/casper/launch.py b/tutorials/casper/launch.py new file mode 100755 index 000000000..b97f7865c --- /dev/null +++ b/tutorials/casper/launch.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +import os, sys + +import argparse, subprocess +from string import Template + +def parse_command_line(args, description): + parser = argparse.ArgumentParser(description=description, + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("--db-nodes", default=1, + help="Number of nodes for the SmartSim database") + parser.add_argument("--ngpus-per-node", default=0, + help="Number of gpus per SmartSim database node") + parser.add_argument("--walltime", default="00:30:00", + help="Total walltime for submitted job") + parser.add_argument("--ensemble-size", default=1, + help="Number of ensemble members to run") + parser.add_argument("--member-nodes", default=1, + help="Number of nodes per ensemble member") + parser.add_argument("--account", default="P93300606", + help="Account ID") + parser.add_argument("--db-port", default=6780, + help="db port") + + args = parser.parse_args(args[1:]) + ngpus = "" + if int(args.ngpus_per_node) > 0: + ngpus = ":ngpus="+args.ngpus_per_node + + + return {"db_nodes":args.db_nodes, "ngpus": ngpus, "client_nodes": args.ensemble_size*args.member_nodes, + "walltime": args.walltime, "account" : args.account, "member_nodes": args.member_nodes, + "ensemble_size": args.ensemble_size, "db_port": args.db_port} + +def execute(command): + """ + Function for running a command on shell. + Args: + command (str): + command that we want to run. + Raises: + Error with the return code from shell. 
+ """ + print ('\n',' >> ',*command,'\n') + + try: + subprocess.check_call(command, stdout=sys.stdout, stderr=subprocess.STDOUT) + + except subprocess.CalledProcessError as e: + raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + + +def _main_func(desc): + templatevars = parse_command_line(sys.argv, desc) + + template_files = ["resv_job.template", "launch_database_cluster.template", "launch_client.template", "cleanup.template"] + + for template in template_files: + with open(template) as f: + src = Template(f.read()) + result = src.safe_substitute(templatevars) + result_file = template.replace("template","sh") + with open(result_file, "w") as f: + f.write(result) + + execute(['qsub', 'resv_job.sh']) + +if __name__ == "__main__": + _main_func(__doc__) diff --git a/tutorials/casper/launch_client.template b/tutorials/casper/launch_client.template new file mode 100644 index 000000000..91fe6d0a6 --- /dev/null +++ b/tutorials/casper/launch_client.template @@ -0,0 +1,10 @@ +#!/bin/bash +#PBS -N ss_client +#PBS -l select=$member_nodes:ncpus=36:mpiprocs=36 +#PBS -l walltime=$walltime +#PBS -j oe +#PBS -k oed +#PBS -A $account + +np=$(expr $member_nodes \* 36) +mpirun -np $np ./smartredis_put_get_3D diff --git a/tutorials/casper/launch_database_cluster.py b/tutorials/casper/launch_database_cluster.template similarity index 86% rename from tutorials/casper/launch_database_cluster.py rename to tutorials/casper/launch_database_cluster.template index f0c4db017..d142d8a10 100755 --- a/tutorials/casper/launch_database_cluster.py +++ b/tutorials/casper/launch_database_cluster.template @@ -3,20 +3,15 @@ #PBS -r n #PBS -j oe #PBS -V -#PBS -l walltime=00:20:00 -#PBS -A P93300606 +#PBS -l walltime=$walltime +#PBS -A $account ##PBS -q regular #PBS -V -#PBS -l select=1:ncpus=1:ompthreads=1:mpiprocs=1 +#PBS -l select=$db_nodes:ncpus=1:ompthreads=1:mpiprocs=1$ngpus import os, sys, time -cesmroot = os.environ.get('CESM_ROOT') -if 
cesmroot is None: - raise SystemExit("ERROR: CESM_ROOT must be defined in environment") -_LIBDIR = os.path.join(cesmroot,"cime","scripts","Tools") -sys.path.append(_LIBDIR) -_LIBDIR = os.path.join(cesmroot,"cime","scripts","lib") +_LIBDIR = "/glade/work/jedwards/casper_npl_clone/lib/python3.7/site-packages/" sys.path.append(_LIBDIR) import socket @@ -26,7 +21,6 @@ from smartsim.database import PBSOrchestrator from smartredis import Client -from CIME.utils import run_cmd """ Launch a distributed, in memory database cluster and use the @@ -86,8 +80,8 @@ def launch_cluster_orc(exp, db_hosts, port): # create the experiment and specify PBS because cheyenne is a PBS system exp = Experiment("launch_cluster_db", launcher="pbs") -db_port = 6780 -db_hosts = collect_db_hosts(1) +db_port = $db_port +db_hosts = collect_db_hosts($db_nodes) # start the database db = launch_cluster_orc(exp, db_hosts, db_port) @@ -112,5 +106,5 @@ def launch_cluster_orc(exp, db_hosts, port): #print(f"Array retrieved from database: {returned_array}") # ## shutdown the database because we don't need it anymore -time.sleep(1200) -exp.stop(db) +#time.sleep(1200) +#exp.stop(db) diff --git a/tutorials/casper/resv_job.cmd b/tutorials/casper/resv_job.cmd deleted file mode 100644 index 32d958135..000000000 --- a/tutorials/casper/resv_job.cmd +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -x -#PBS -N resv_job -#PBS -l select=1:ncpus=4:mpiprocs=4+1:ncpus=1:mpiprocs=1 -##PBS -l gpu_type=v100 -#PBS -l walltime=00:30:00 -#PBS -W create_resv_from_job=true -#PBS -j oe -#PBS -k oed -#PBS -q casper -#PBS -A P93300606 - -for rsv in $(qstat -Q|awk '$1 ~ /^R/{print $1}') -do - parent_job=$(pbs_rstat -F $rsv|awk '$1 ~ /^reserve_job/{print $3}') - if [[ "${PBS_JOBID}" == "${parent_job}" ]] ; then - rsvname=$rsv - break - fi -done -if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is set to '$rsvname'"; fi - -me=$(whoami) -pbs_ralter -U $me $rsvname -export 
CESM_ROOT=/glade/work/jedwards/sandboxes/cesm2_x_alpha.smartsim/ -db_jobid=$(qsub -q $rsvname -v CESM_ROOT launch_database_cluster.py) - -head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) -# This gets the ib network -#SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):6780" -# This gets the external network -SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):6780" -export SSDB -#./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force -#./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test -#./case.submit - -cat < sr.cmd -#!/bin/bash -#PBS -N smartredis_job -#PBS -l select=1:ncpus=4:mpiprocs=4 -#PBS -l walltime=00:20:00 -#PBS -j oe -#PBS -k oed -#PBS -A P93300606 - -mpirun -np 4 ./smartredis_put_get_3D - -sleep 10 -EOF - -qsub -l walltime=00:20:00 -AP93300606 -q $rsvname -v SSDB ./sr.cmd -# clean up - -# -#pbs_rdel $rsvname -cat < cleanup.cmd -#!/bin/bash -#PBS -N cleanup -#PBS -l select=1:ncpus=1:mpiprocs=1 -#PBS -l walltime=00:30:00 -#PBS -j oe -#PBS -k oed -#PBS -A P93300606 - -for i in \`seq 1 100\`; -do - running=\$(qstat -Q $rsvname | awk '\$6 ~/[0-9]+/ {print \$6, \$7}') - if [[ "\$running" == "0 0" ]] ; then - pbs_rdel $rsvname - break - fi - sleep 10 -done -EOF1 -qsub -q casper ./cleanup.cmd diff --git a/tutorials/casper/resv_gpu_job.cmd b/tutorials/casper/resv_job.template similarity index 50% rename from tutorials/casper/resv_gpu_job.cmd rename to tutorials/casper/resv_job.template index 380b86e3f..795cc48b7 100644 --- a/tutorials/casper/resv_gpu_job.cmd +++ b/tutorials/casper/resv_job.template @@ -1,13 +1,13 @@ #!/bin/bash -x #PBS -N resv_job -#PBS -l select=1:ncpus=36:mpiprocs=36+1:ncpus=1:mpiprocs=1:ngpus=2 +#PBS -l select=$db_nodes:ncpus=1:mpiprocs=1$ngpus+$client_nodes:ncpus=36:mpiprocs=36 #PBS -l gpu_type=v100 -#PBS -l walltime=00:30:00 +#PBS -l walltime=$walltime #PBS -W create_resv_from_job=true #PBS -j oe #PBS -k oed #PBS -q casper -#PBS -A P93300606 +#PBS -A $account for rsv in $(qstat 
-Q|awk '$1 ~ /^R/{print $1}') do @@ -21,35 +21,25 @@ if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is se me=$(whoami) pbs_ralter -U $me $rsvname -export CESM_ROOT=/glade/work/jedwards/sandboxes/cesm2_x_alpha.smartsim/ -gpu_jobid=$(qsub -q $rsvname -v CESM_ROOT launch_database_cluster.py) +db_jobid=$(qsub -q $rsvname launch_database_cluster.sh) head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) -SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):6780" +# This gets the ib network +SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):$db_port" +# This gets the external network +#SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port" export SSDB #./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force #./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test #./case.submit -qsub -l walltime=00:20:00 -AP93300606 -q $rsvname -v SSDB ./smartredis_put_get_3D +for i in `seq 1 $ensemble_size`; +do + client_id=$(qsub -q $rsvname -v SSDB ./launch_client.sh) +done + # clean up # #pbs_rdel $rsvname -cat < cleanup.cmd -#!/bin/bash -#PBS -N cleanup -#PBS -l select=1:ncpus=1:mpiprocs=1 -#PBS -l walltime=00:30:00 -#PBS -j oe -#PBS -k oed -#PBS -A P93300606 -do - running=\$(qstat -Q $rsvname | awk '\$6 ~/[0-9]+/print {\$6, \$7}'} - if [[ "\$running" == "0 0" ]] ; then - pbs_rdel $rsvname - break - fi - sleep 10 -done -EOF1 -qsub -q casper ./cleanup.cmd +export RSVNAME=$rsvname +qsub -q casper -W depend=afterany:$client_id:$db_jobid -v RSVNAME ./cleanup.sh From e2161c524746b97117326dcc51d37ee127f29514 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 16:46:06 -0600 Subject: [PATCH 05/15] document --- tutorials/casper/README.md | 39 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 tutorials/casper/README.md diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md new file mode 100644 index 000000000..20fe7115b --- /dev/null 
+++ b/tutorials/casper/README.md @@ -0,0 +1,39 @@ + +# Casper + +```bash +module purge +module load gnu/9.1.0 ncarcompilers openmpi netcdf ncarenv cmake +``` + +I also needed a newer version of gmake, it's in /glade/work/jedwards/make-4.3/bin/make + +I am using a python environment created with: +``` +ncar_pylib -c 20201220 /glade/work/$USER/casper_npl_clone +``` + +``pip install smartsim`` +``smart --device gpu`` +``pip install smartredis`` + +launch.py is the primary launch script +``` +usage: launch.py [-h] [--db-nodes DB_NODES] [--ngpus-per-node NGPUS_PER_NODE] + [--walltime WALLTIME] [--ensemble-size ENSEMBLE_SIZE] + [--member-nodes MEMBER_NODES] [--account ACCOUNT] + [--db-port DB_PORT] + +optional arguments: + -h, --help show this help message and exit + --db-nodes DB_NODES Number of nodes for the SmartSim database + --ngpus-per-node NGPUS_PER_NODE + Number of gpus per SmartSim database node + --walltime WALLTIME Total walltime for submitted job + --ensemble-size ENSEMBLE_SIZE + Number of ensemble members to run + --member-nodes MEMBER_NODES + Number of nodes per ensemble member + --account ACCOUNT Account ID + --db-port DB_PORT db port +``` From 712ad6eeda7bae7d2ba2cf049a372856d426a5fa Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 16:47:29 -0600 Subject: [PATCH 06/15] document --- tutorials/casper/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index 20fe7115b..0b69eb26b 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -26,14 +26,14 @@ usage: launch.py [-h] [--db-nodes DB_NODES] [--ngpus-per-node NGPUS_PER_NODE] optional arguments: -h, --help show this help message and exit - --db-nodes DB_NODES Number of nodes for the SmartSim database + --db-nodes DB_NODES Number of nodes for the SmartSim database, default=1 --ngpus-per-node NGPUS_PER_NODE - Number of gpus per SmartSim database node - --walltime WALLTIME Total 
walltime for submitted job + Number of gpus per SmartSim database node, default=0 + --walltime WALLTIME Total walltime for submitted job, default=00:30:00 --ensemble-size ENSEMBLE_SIZE - Number of ensemble members to run + Number of ensemble members to run, default=1 --member-nodes MEMBER_NODES - Number of nodes per ensemble member + Number of nodes per ensemble member, default=1 --account ACCOUNT Account ID - --db-port DB_PORT db port + --db-port DB_PORT db port, default=6780 ``` From cf9c03bd3e842c3633b9d350e0fd5d90df1d160f Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 16:53:28 -0600 Subject: [PATCH 07/15] document --- tutorials/casper/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index 0b69eb26b..3fa6489f4 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -37,3 +37,13 @@ optional arguments: --account ACCOUNT Account ID --db-port DB_PORT db port, default=6780 ``` +It creates pbs jobs from each of the 4 templates +1. resv_job.template +2. launch_database_cluster.template +3. launch_client.sh +4. cleanup.sh + +and submits the resv_job.sh which in turn will create a reservation large enough for the db and all the ensemble members +then it submits those jobs in the newly created reservation. It starts the database and sets the SSDB environment variable +then launchs each of the clients, all of this is done within the newly created reservation. It also launchs a cleanup script +that will remove the reservation when all jobs are complete. 
\ No newline at end of file From 8c37f9405ee353c1a468e2321ca2d822d166d63d Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 16:54:12 -0600 Subject: [PATCH 08/15] fix document --- tutorials/casper/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index 3fa6489f4..b28669860 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -40,8 +40,8 @@ optional arguments: It creates pbs jobs from each of the 4 templates 1. resv_job.template 2. launch_database_cluster.template -3. launch_client.sh -4. cleanup.sh +3. launch_client.template +4. cleanup.template and submits the resv_job.sh which in turn will create a reservation large enough for the db and all the ensemble members then it submits those jobs in the newly created reservation. It starts the database and sets the SSDB environment variable From 30b779ec3ae222e12c337b592e85f60d597ce3d2 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Wed, 14 Jul 2021 17:02:59 -0600 Subject: [PATCH 09/15] fix document --- tutorials/casper/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index b28669860..d220cfc7e 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -14,9 +14,17 @@ ncar_pylib -c 20201220 /glade/work/$USER/casper_npl_clone ``` ``pip install smartsim`` + ``smart --device gpu`` + ``pip install smartredis`` +First you need to build the smartredis_put_get_3D.F90 fortran example: +``` +make +``` + + launch.py is the primary launch script ``` usage: launch.py [-h] [--db-nodes DB_NODES] [--ngpus-per-node NGPUS_PER_NODE] From 5281f34bf18a451f2a5ed4cc39fed5e2df05e406 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 08:08:25 -0600 Subject: [PATCH 10/15] improve libpath and cleanup --- tutorials/casper/cleanup.template | 9 -- tutorials/casper/launch.py | 37 +++----- 
.../casper/launch_database_cluster.template | 57 ++++++------- tutorials/casper/resv_job.template | 9 +- tutorials/casper/utils.py | 85 +++++++++++++++++++ 5 files changed, 126 insertions(+), 71 deletions(-) delete mode 100644 tutorials/casper/cleanup.template mode change 100755 => 100644 tutorials/casper/launch_database_cluster.template create mode 100644 tutorials/casper/utils.py diff --git a/tutorials/casper/cleanup.template b/tutorials/casper/cleanup.template deleted file mode 100644 index d304cfd91..000000000 --- a/tutorials/casper/cleanup.template +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -#PBS -N cleanup -#PBS -l select=1:ncpus=1:mpiprocs=1 -#PBS -l walltime=00:30:00 -#PBS -j oe -#PBS -k oed -#PBS -A $account - -pbs_rdel $RSVNAME diff --git a/tutorials/casper/launch.py b/tutorials/casper/launch.py index b97f7865c..3f7629d40 100755 --- a/tutorials/casper/launch.py +++ b/tutorials/casper/launch.py @@ -3,24 +3,25 @@ import argparse, subprocess from string import Template +from utils import run_cmd def parse_command_line(args, description): parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--db-nodes", default=1, - help="Number of nodes for the SmartSim database") + help="Number of nodes for the SmartSim database, default=1") parser.add_argument("--ngpus-per-node", default=0, - help="Number of gpus per SmartSim database node") + help="Number of gpus per SmartSim database node, default=0") parser.add_argument("--walltime", default="00:30:00", - help="Total walltime for submitted job") + help="Total walltime for submitted job, default=00:30:00") parser.add_argument("--ensemble-size", default=1, - help="Number of ensemble members to run") + help="Number of ensemble members to run, default=1") parser.add_argument("--member-nodes", default=1, - help="Number of nodes per ensemble member") + help="Number of nodes per ensemble member, default=1") parser.add_argument("--account", 
default="P93300606", help="Account ID") parser.add_argument("--db-port", default=6780, - help="db port") + help="db port, default=6780") args = parser.parse_args(args[1:]) ngpus = "" @@ -30,30 +31,12 @@ def parse_command_line(args, description): return {"db_nodes":args.db_nodes, "ngpus": ngpus, "client_nodes": args.ensemble_size*args.member_nodes, "walltime": args.walltime, "account" : args.account, "member_nodes": args.member_nodes, - "ensemble_size": args.ensemble_size, "db_port": args.db_port} - -def execute(command): - """ - Function for running a command on shell. - Args: - command (str): - command that we want to run. - Raises: - Error with the return code from shell. - """ - print ('\n',' >> ',*command,'\n') - - try: - subprocess.check_call(command, stdout=sys.stdout, stderr=subprocess.STDOUT) - - except subprocess.CalledProcessError as e: - raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - + "ensemble_size": args.ensemble_size, "db_port": args.db_port, "python_sys_path": sys.path} def _main_func(desc): templatevars = parse_command_line(sys.argv, desc) - template_files = ["resv_job.template", "launch_database_cluster.template", "launch_client.template", "cleanup.template"] + template_files = ["resv_job.template", "launch_database_cluster.template", "launch_client.template"] for template in template_files: with open(template) as f: @@ -63,7 +46,7 @@ def _main_func(desc): with open(result_file, "w") as f: f.write(result) - execute(['qsub', 'resv_job.sh']) + run_cmd("qsub resv_job.sh", verbose=True) if __name__ == "__main__": _main_func(__doc__) diff --git a/tutorials/casper/launch_database_cluster.template b/tutorials/casper/launch_database_cluster.template old mode 100755 new mode 100644 index d142d8a10..10f47018c --- a/tutorials/casper/launch_database_cluster.template +++ b/tutorials/casper/launch_database_cluster.template @@ -11,17 +11,16 @@ import os, sys, time -_LIBDIR = 
"/glade/work/jedwards/casper_npl_clone/lib/python3.7/site-packages/" -sys.path.append(_LIBDIR) +# The python environment is not passed properly to submitted jobs on casper +_LIBDIR = $python_sys_path +sys.path.extend(_LIBDIR) -import socket +import socket, subprocess import numpy as np - +from utils import run_cmd from smartsim import Experiment, constants from smartsim.database import PBSOrchestrator -from smartredis import Client - """ Launch a distributed, in memory database cluster and use the SmartRedis python client to send and recieve some numpy arrays. @@ -77,6 +76,21 @@ def launch_cluster_orc(exp, db_hosts, port): return db +def monitor_client_jobs(rsvname): + jobs_done=False + while not jobs_done: + s, o, e = run_cmd("qstat -q {}".format(rsvname), verbose=True) + jobs_left = o.split()[-2:] + print("Jobs left: Running {} Queued {}".format(int(jobs_left[0]),int(jobs_left[1]))) + if int(jobs_left[0]) + int(jobs_left[1]) == 1: + jobs_done = True + else: + time.sleep(60) + + + + + # create the experiment and specify PBS because cheyenne is a PBS system exp = Experiment("launch_cluster_db", launcher="pbs") @@ -85,26 +99,11 @@ db_hosts = collect_db_hosts($db_nodes) # start the database db = launch_cluster_orc(exp, db_hosts, db_port) -## test sending some arrays to the database cluster -## the following functions are largely the same across all the -## client languages: C++, C, Fortran, Python -# -## only need one address of one shard of DB to connect client -#db_address = ":".join((socket.gethostbyname(db_hosts[0]), str(db_port))) - -#s, o, e = run_cmd("mpirun -n 36 --hostfile {} ./hello".format(new_host_file), verbose=True) -#print("After hello {} {} {} ".format(s,o,e)) -#client = Client(address=db_address, cluster=True) -# -## put into database -#test_array = np.array([1,2,3,4]) -#print(f"Array put in database: {test_array}") -#client.put_tensor("test", test_array) -# -## get from database -#returned_array = client.get_tensor("test") -#print(f"Array retrieved 
from database: {returned_array}") -# -## shutdown the database because we don't need it anymore -#time.sleep(1200) -#exp.stop(db) +rsvname = os.environ["RSVNAME"] +# stay alive until client jobs have completed +monitor_client_jobs(rsvname) + +# shutdown the database because we don't need it anymore +exp.stop(db) +# delete the job reservation +run_cmd("pbs_rdel {}".format(rsvname)) diff --git a/tutorials/casper/resv_job.template b/tutorials/casper/resv_job.template index 795cc48b7..66d6a5626 100644 --- a/tutorials/casper/resv_job.template +++ b/tutorials/casper/resv_job.template @@ -21,7 +21,8 @@ if [ -z $rsvname ]; then echo "rsv is unset"; exit -1; else echo "rsv name is se me=$(whoami) pbs_ralter -U $me $rsvname -db_jobid=$(qsub -q $rsvname launch_database_cluster.sh) + +db_jobid=$(qsub -q $rsvname -vRSVNAME=$rsvname launch_database_cluster.sh) head_host=$(qstat -f $PBS_JOBID|awk '$1 ~ /^exec_host$/{print $3}'|cut -d\/ -f1-1) # This gets the ib network @@ -37,9 +38,5 @@ do client_id=$(qsub -q $rsvname -v SSDB ./launch_client.sh) done -# clean up -# -#pbs_rdel $rsvname -export RSVNAME=$rsvname -qsub -q casper -W depend=afterany:$client_id:$db_jobid -v RSVNAME ./cleanup.sh + diff --git a/tutorials/casper/utils.py b/tutorials/casper/utils.py new file mode 100644 index 000000000..faf24b609 --- /dev/null +++ b/tutorials/casper/utils.py @@ -0,0 +1,85 @@ +import subprocess, os, io + +def _convert_to_fd(filearg, from_dir, mode="a"): + filearg = _get_path(filearg, from_dir) + + return open(filearg, mode) + +_hack=object() + +def run_cmd(cmd, input_str=None, from_dir=None, verbose=None, + arg_stdout=_hack, arg_stderr=_hack, env=None, + combine_output=False, timeout=None, executable=None): + """ + Wrapper around subprocess to make it much more convenient to run shell commands + + >>> run_cmd('ls file_i_hope_doesnt_exist')[0] != 0 + True + """ + + # Real defaults for these value should be subprocess.PIPE + if arg_stdout is _hack: + arg_stdout = subprocess.PIPE + elif 
isinstance(arg_stdout, str): + arg_stdout = _convert_to_fd(arg_stdout, from_dir) + + if arg_stderr is _hack: + arg_stderr = subprocess.STDOUT if combine_output else subprocess.PIPE + elif isinstance(arg_stderr, str): + arg_stderr = _convert_to_fd(arg_stdout, from_dir) + + if verbose: + print("RUN: {}\nFROM: {}".format(cmd, os.getcwd() if from_dir is None else from_dir)) + + if (input_str is not None): + stdin = subprocess.PIPE + else: + stdin = None + + proc = subprocess.Popen(cmd, + shell=True, + stdout=arg_stdout, + stderr=arg_stderr, + stdin=stdin, + cwd=from_dir, + executable=executable, + env=env) + + output, errput = proc.communicate(input_str) + + # In Python3, subprocess.communicate returns bytes. We want to work with strings + # as much as possible, so we convert bytes to string (which is unicode in py3) via + # decode. + if output is not None: + try: + output = output.decode('utf-8', errors='ignore') + except AttributeError: + pass + if errput is not None: + try: + errput = errput.decode('utf-8', errors='ignore') + except AttributeError: + pass + + # Always strip outputs + if output: + output = output.strip() + if errput: + errput = errput.strip() + + stat = proc.wait() + if isinstance(arg_stdout, io.IOBase): + arg_stdout.close() # pylint: disable=no-member + if isinstance(arg_stderr, io.IOBase) and arg_stderr is not arg_stdout: + arg_stderr.close() # pylint: disable=no-member + + + if verbose: + if stat != 0: + print(" stat: {:d}\n".format(stat)) + if output: + print(" output: {}\n".format(output)) + if errput: + print(" errput: {}\n".format(errput)) + + return stat, output, errput From f481bd7f9a95f576499303acef65eebde6c0ba39 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 08:12:30 -0600 Subject: [PATCH 11/15] undo cheyenne changes --- tutorials/cheyenne/launch_database_cluster.py | 84 ++++++------------- 1 file changed, 27 insertions(+), 57 deletions(-) diff --git a/tutorials/cheyenne/launch_database_cluster.py 
b/tutorials/cheyenne/launch_database_cluster.py index 712716d1f..f65daf253 100644 --- a/tutorials/cheyenne/launch_database_cluster.py +++ b/tutorials/cheyenne/launch_database_cluster.py @@ -1,32 +1,11 @@ -#!/usr/bin/env python3 -#PBS -N smartsimtest -#PBS -r n -#PBS -j oe -#PBS -V -#PBS -l walltime=00:10:00 -#PBS -A P93300606 -#PBS -q regular -#PBS -V -#PBS -S /bin/bash -#PBS -l select=4:ncpus=36:mpiprocs=36:ompthreads=1 -import os, sys -cesmroot = os.environ.get('CESM_ROOT') -if cesmroot is None: - raise SystemExit("ERROR: CESM_ROOT must be defined in environment") - -_LIBDIR = os.path.join(cesmroot,"cime","scripts","Tools") -sys.path.append(_LIBDIR) -_LIBDIR = os.path.join(cesmroot,"cime","scripts","lib") -sys.path.append(_LIBDIR) - -import socket +import os import numpy as np from smartsim import Experiment, constants from smartsim.database import PBSOrchestrator from smartredis import Client -from CIME.utils import run_cmd + """ Launch a distributed, in memory database cluster and use the @@ -55,14 +34,11 @@ def collect_db_hosts(num_hosts): # account for mpiprocs causing repeats in PBS_NODEFILE hosts = list(set(hosts)) + if len(hosts) >= num_hosts: - new_host_file = os.path.basename(node_file) - with open(new_host_file, "w") as f: - for line in hosts[num_hosts:]: - f.write(line+".ib0.cheyenne.ucar.edu\n") - return hosts[:num_hosts], new_host_file + return hosts[:num_hosts] else: - raise Exception("PBS_NODEFILE {} had {} hosts, not {}".format(node_file, len(hosts),num_hosts)) + raise Exception(f"PBS_NODEFILE had {len(hosts)} hosts, not {num_hosts}") def launch_cluster_orc(exp, db_hosts, port): @@ -87,39 +63,33 @@ def launch_cluster_orc(exp, db_hosts, port): return db -print("before PBS_NODEFILE is {}".format(os.getenv("PBS_NODEFILE"))) - # create the experiment and specify PBS because cheyenne is a PBS system exp = Experiment("launch_cluster_db", launcher="pbs") db_port = 6780 -db_hosts, new_host_file = collect_db_hosts(3) +db_hosts = collect_db_hosts(3) # start 
the database db = launch_cluster_orc(exp, db_hosts, db_port) -## test sending some arrays to the database cluster -## the following functions are largely the same across all the -## client languages: C++, C, Fortran, Python -# -## only need one address of one shard of DB to connect client -db_address = ":".join((socket.gethostbyname(db_hosts[0]), str(db_port))) -print("db_address is {}".format(db_address)) -os.environ["SSDB"] = db_address -#os.environ["PBS_NODEFILE"] = new_host_file -print("after PBS_NODEFILE is {}".format(os.getenv("PBS_NODEFILE"))) - -s, o, e = run_cmd("mpirun -n 36 --hostfile {} ./hello".format(new_host_file), verbose=True) -print("After hello {} {} {} ".format(s,o,e)) -#client = Client(address=db_address, cluster=True) -# -## put into database -#test_array = np.array([1,2,3,4]) -#print(f"Array put in database: {test_array}") -#client.put_tensor("test", test_array) -# -## get from database -#returned_array = client.get_tensor("test") -#print(f"Array retrieved from database: {returned_array}") -# -## shutdown the database because we don't need it anymore + +# test sending some arrays to the database cluster +# the following functions are largely the same across all the +# client languages: C++, C, Fortran, Python + +# only need one address of one shard of DB to connect client +db_address = ":".join((db_hosts[0], str(db_port))) +client = Client(address=db_address, cluster=True) + +# put into database +test_array = np.array([1,2,3,4]) +print(f"Array put in database: {test_array}") +client.put_tensor("test", test_array) + +# get from database +returned_array = client.get_tensor("test") +print(f"Array retrieved from database: {returned_array}") + +# shutdown the database because we don't need it anymore exp.stop(db) + + From c7413fc93d9041b03159959c21b94495792a4862 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 08:14:12 -0600 Subject: [PATCH 12/15] undo cheyenne changes --- tutorials/cheyenne/Makefile | 18 ------------------ 
tutorials/cheyenne/cime_database.sh | 8 -------- tutorials/cheyenne/hello.F90 | 19 ------------------- 3 files changed, 45 deletions(-) delete mode 100644 tutorials/cheyenne/Makefile delete mode 100644 tutorials/cheyenne/cime_database.sh delete mode 100644 tutorials/cheyenne/hello.F90 diff --git a/tutorials/cheyenne/Makefile b/tutorials/cheyenne/Makefile deleted file mode 100644 index f85cb511d..000000000 --- a/tutorials/cheyenne/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -REDIS_HOME = /glade/u/home/jedwards/sandboxes/cesm2_x_alpha/components/SmartSim/SmartRedis -REDIS_SRC = $(REDIS_HOME)/src/fortran/client.F90 \ - $(REDIS_HOME)/src/fortran/dataset.F90 \ - $(REDIS_HOME)/src/fortran/fortran_c_interop.F90 - -REDIS_OBJ = client.o dataset.o fortran_c_interop.o -MPIFC = mpif90 - - -hello: hello.F90 $(REDIS_OBJ) - $(MPIFC) $< -o $@ $(REDIS_OBJ) -L$(REDIS_HOME)/install/lib -lsmartredis -Wl,-rpath $(REDIS_HOME)/install/lib - -%.o : $(REDIS_HOME)/src/fortran/%.F90 - $(MPIFC) $< -c -o $@ -I $(REDIS_HOME)/install/include - - -client.o: dataset.o -dataset.o: fortran_c_interop.o diff --git a/tutorials/cheyenne/cime_database.sh b/tutorials/cheyenne/cime_database.sh deleted file mode 100644 index 8299c9464..000000000 --- a/tutorials/cheyenne/cime_database.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -module purge -# Fill out the required -module load openmpi - -load_conda -conda activate smartsim-test -launch_database_cluster.py diff --git a/tutorials/cheyenne/hello.F90 b/tutorials/cheyenne/hello.F90 deleted file mode 100644 index cda0515ed..000000000 --- a/tutorials/cheyenne/hello.F90 +++ /dev/null @@ -1,19 +0,0 @@ -! Fortran example - program hello - use MPI - use iso_c_binding - use smartredis_client, only : client_type - - integer :: rank, size, ierror, tag, status(MPI_STATUS_SIZE) - type(client_type) :: client - - call MPI_INIT(ierror) - call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierror) - call MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierror) - - call client%initialize(.true.) 
- - print*, 'node', rank, ': Hello world' - call MPI_FINALIZE(ierror) - - end program From 677ff5c211ec94942e75cd5e8cabd2b30139b758 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 08:22:25 -0600 Subject: [PATCH 13/15] update README --- tutorials/casper/README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index d220cfc7e..10499cba9 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -24,7 +24,6 @@ First you need to build the smartredis_put_get_3D.F90 fortran example: make ``` - launch.py is the primary launch script ``` usage: launch.py [-h] [--db-nodes DB_NODES] [--ngpus-per-node NGPUS_PER_NODE] @@ -45,13 +44,11 @@ optional arguments: --account ACCOUNT Account ID --db-port DB_PORT db port, default=6780 ``` -It creates pbs jobs from each of the 4 templates +It creates pbs jobs from each of the 3 templates 1. resv_job.template 2. launch_database_cluster.template 3. launch_client.template -4. cleanup.template -and submits the resv_job.sh which in turn will create a reservation large enough for the db and all the ensemble members -then it submits those jobs in the newly created reservation. It starts the database and sets the SSDB environment variable -then launchs each of the clients, all of this is done within the newly created reservation. It also launchs a cleanup script -that will remove the reservation when all jobs are complete. \ No newline at end of file +and submits the resv_job.sh which in turn will create a reservation large enough for the db and all the ensemble members. +It submits those jobs in the newly created reservation. It starts the database and sets the SSDB environment variable +then launchs each of the clients, all of this is done within the newly created reservation. The database job monitors progress of the clients and exits and removes the reservation when it is complete. 
\ No newline at end of file From ae136db4a499daa44367c9b814756e3abc0bbcef Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Fri, 16 Jul 2021 08:47:05 -0600 Subject: [PATCH 14/15] add note about asking permission --- tutorials/casper/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index 10499cba9..e7da2842f 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -51,4 +51,7 @@ It creates pbs jobs from each of the 3 templates and submits the resv_job.sh which in turn will create a reservation large enough for the db and all the ensemble members. It submits those jobs in the newly created reservation. It starts the database and sets the SSDB environment variable -then launchs each of the clients, all of this is done within the newly created reservation. The database job monitors progress of the clients and exits and removes the reservation when it is complete. \ No newline at end of file +then launchs each of the clients, all of this is done within the newly created reservation. The database job monitors progress of the clients and exits and removes the reservation when it is complete. + +** Currently to use this feature you must first send a note to cislhelp@ucar.edu and ask for permission to use the +create_resv_from_job feature of PBS. 
** \ No newline at end of file From fb7dd2805ead92e9a1cb993fe942f059e305a619 Mon Sep 17 00:00:00 2001 From: Jim Edwards Date: Thu, 22 Jul 2021 07:53:47 -0600 Subject: [PATCH 15/15] cleanup example code --- tutorials/casper/Makefile | 20 +++++++------- tutorials/casper/README.md | 27 ++++++++++--------- .../casper/launch_database_cluster.template | 7 ++--- tutorials/casper/resv_job.template | 3 --- tutorials/casper/utils.py | 11 +++----- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/tutorials/casper/Makefile b/tutorials/casper/Makefile index bf7d7ee7b..14caf1ff0 100644 --- a/tutorials/casper/Makefile +++ b/tutorials/casper/Makefile @@ -1,17 +1,19 @@ -REDIS_HOME = $(HOME)/sandboxes/SmartRedis -REDIS_SRC = $(REDIS_HOME)/src/fortran/client.F90 \ - $(REDIS_HOME)/src/fortran/dataset.F90 \ - $(REDIS_HOME)/src/fortran/fortran_c_interop.F90 +SMARTREDIS_FTN = $(SMARTREDIS_FSRC)/client.F90 \ + $(SMARTREDIS_FSRC)/dataset.F90 \ + $(SMARTREDIS_FSRC)/fortran_c_interop.F90 -REDIS_OBJ = client.o dataset.o fortran_c_interop.o +SMARTREDIS_OBJ = client.o dataset.o fortran_c_interop.o MPIFC = mpif90 -smartredis_put_get_3D: smartredis_put_get_3D.F90 $(REDIS_OBJ) - $(MPIFC) $< -o $@ $(REDIS_OBJ) -L$(REDIS_HOME)/install/lib -lsmartredis -Wl,-rpath $(REDIS_HOME)/install/lib +smartredis_put_get_3D: smartredis_put_get_3D.F90 $(SMARTREDIS_OBJ) + $(MPIFC) $< -o $@ $(SMARTREDIS_OBJ) -L$(SMARTREDIS_LIB) -lsmartredis -Wl,-rpath $(SMARTREDIS_LIB) -%.o : $(REDIS_HOME)/src/fortran/%.F90 - $(MPIFC) $< -c -o $@ -I $(REDIS_HOME)/install/include +%.o : $(SMARTREDIS_FSRC)/%.F90 + $(MPIFC) $< -c -o $@ -I $(SMARTREDIS_INCLUDE) client.o: dataset.o dataset.o: fortran_c_interop.o + +clean: + $(RM) *.o *.mod diff --git a/tutorials/casper/README.md b/tutorials/casper/README.md index e7da2842f..9957a9bad 100644 --- a/tutorials/casper/README.md +++ b/tutorials/casper/README.md @@ -1,9 +1,10 @@ - -# Casper +# Casper ```bash module purge -module load gnu/9.1.0 ncarcompilers openmpi netcdf ncarenv 
cmake +module use /glade/p/cesmdata/cseg/PROGS/modulefiles/CrayLabs +module load gnu ncarcompilers openmpi netcdf ncarenv cmake +module load SmartRedis ``` I also needed a newer version of gmake, it's in /glade/work/jedwards/make-4.3/bin/make @@ -24,23 +25,23 @@ First you need to build the smartredis_put_get_3D.F90 fortran example: make ``` -launch.py is the primary launch script +launch.py is the primary launch script ``` usage: launch.py [-h] [--db-nodes DB_NODES] [--ngpus-per-node NGPUS_PER_NODE] - [--walltime WALLTIME] [--ensemble-size ENSEMBLE_SIZE] - [--member-nodes MEMBER_NODES] [--account ACCOUNT] - [--db-port DB_PORT] + [--walltime WALLTIME] [--ensemble-size ENSEMBLE_SIZE] + [--member-nodes MEMBER_NODES] [--account ACCOUNT] + [--db-port DB_PORT] optional arguments: -h, --help show this help message and exit --db-nodes DB_NODES Number of nodes for the SmartSim database, default=1 --ngpus-per-node NGPUS_PER_NODE - Number of gpus per SmartSim database node, default=0 + Number of gpus per SmartSim database node, default=0 --walltime WALLTIME Total walltime for submitted job, default=00:30:00 --ensemble-size ENSEMBLE_SIZE - Number of ensemble members to run, default=1 + Number of ensemble members to run, default=1 --member-nodes MEMBER_NODES - Number of nodes per ensemble member, default=1 + Number of nodes per ensemble member, default=1 --account ACCOUNT Account ID --db-port DB_PORT db port, default=6780 ``` @@ -51,7 +52,9 @@ It creates pbs jobs from each of the 3 templates and submits the resv_job.sh which in turn will create a reservation large enough for the db and all the ensemble members. It submits those jobs in the newly created reservation. It starts the database and sets the SSDB environment variable -then launchs each of the clients, all of this is done within the newly created reservation. The database job monitors progress of the clients and exits and removes the reservation when it is complete. 
+then launches each of the clients, all of this is done within the newly created reservation. The database job monitors progress of the clients and exits and removes the reservation when it is complete. + +Note that this launches the database and client jobs separately - The preferred method is to launch the client through SmartSim. ** Currently to use this feature you must first send a note to cislhelp@ucar.edu and ask for permission to use the -create_resv_from_job feature of PBS. ** \ No newline at end of file +create_resv_from_job feature of PBS. ** \ No newline at end of file diff --git a/tutorials/casper/launch_database_cluster.template b/tutorials/casper/launch_database_cluster.template index 10f47018c..d8f75f7dc 100644 --- a/tutorials/casper/launch_database_cluster.template +++ b/tutorials/casper/launch_database_cluster.template @@ -25,9 +25,6 @@ from smartsim.database import PBSOrchestrator Launch a distributed, in memory database cluster and use the SmartRedis python client to send and recieve some numpy arrays. -This example runs in an interactive allocation with at least three -nodes and 1 processor per node. - i.e. 
qsub -l select=3:ncpus=1 -l walltime=00:10:00 -A -q premium -I """ @@ -44,7 +41,7 @@ def collect_db_hosts(num_hosts): host = line.split(".")[0] hosts.append(host) else: - raise Exception("could not parse interactive allocation nodes from PBS_NODEFILE") + raise Exception("could not parse allocation nodes from PBS_NODEFILE") # account for mpiprocs causing repeats in PBS_NODEFILE hosts = list(set(hosts)) @@ -67,7 +64,7 @@ def launch_cluster_orc(exp, db_hosts, port): # pass in objects to make dirs for exp.generate(db, overwrite=True) - # start the database on interactive allocation + # start the database within the reservation allocation exp.start(db, block=True) # get the status of the database diff --git a/tutorials/casper/resv_job.template b/tutorials/casper/resv_job.template index 66d6a5626..ef8baf621 100644 --- a/tutorials/casper/resv_job.template +++ b/tutorials/casper/resv_job.template @@ -30,9 +30,6 @@ SSDB="$(getent hosts ${head_host}-ib|awk '{print $1}'):$db_port" # This gets the external network #SSDB="$(getent hosts ${head_host}.ucar.edu |awk '{print $1}'):$db_port" export SSDB -#./xmlchange JOB_QUEUE=$rsvname --subgroup case.test --force -#./xmlchange JOB_WALLCLOCK_TIME=00:20:00 --subgroup case.test -#./case.submit for i in `seq 1 $ensemble_size`; do client_id=$(qsub -q $rsvname -v SSDB ./launch_client.sh) diff --git a/tutorials/casper/utils.py b/tutorials/casper/utils.py index faf24b609..c822d594a 100644 --- a/tutorials/casper/utils.py +++ b/tutorials/casper/utils.py @@ -5,10 +5,9 @@ def _convert_to_fd(filearg, from_dir, mode="a"): return open(filearg, mode) -_hack=object() def run_cmd(cmd, input_str=None, from_dir=None, verbose=None, - arg_stdout=_hack, arg_stderr=_hack, env=None, + arg_stdout=subprocess.PIPE, arg_stderr=subprocess.PIPE, env=None, combine_output=False, timeout=None, executable=None): """ Wrapper around subprocess to make it much more convenient to run shell commands @@ -18,13 +17,11 @@ def run_cmd(cmd, input_str=None, from_dir=None, 
verbose=None, """ # Real defaults for these value should be subprocess.PIPE - if arg_stdout is _hack: - arg_stdout = subprocess.PIPE - elif isinstance(arg_stdout, str): + if isinstance(arg_stdout, str): arg_stdout = _convert_to_fd(arg_stdout, from_dir) - if arg_stderr is _hack: - arg_stderr = subprocess.STDOUT if combine_output else subprocess.PIPE + if combine_output: + arg_stderr = subprocess.STDOUT elif isinstance(arg_stderr, str): arg_stderr = _convert_to_fd(arg_stdout, from_dir)