diff --git a/tests/reframe/config/settings.py b/tests/reframe/config/settings.py
new file mode 100644
index 0000000000..d3d1c90ad5
--- /dev/null
+++ b/tests/reframe/config/settings.py
@@ -0,0 +1,117 @@
+site_configuration = {
+    'systems': [
+        {
+            'name': 'Example_system',
+            'descr': 'This is just an example system',
+            'modules_system': 'tmod',
+            'hostnames': ['login', 'int'],
+            'partitions': [
+                {
+                    'name': 'short',
+                    'scheduler': 'slurm',
+                    'launcher': 'srun',
+                    'access': ['-p short'],
+                    'environs': ['foss', 'container'],
+                    'container_platforms': [
+                        {
+                            'type': 'Singularity',
+                            'modules': [],
+                            'variables': [['SLURM_MPI_TYPE', 'pmix']]
+                        }
+                    ],
+                    'processor': {
+                        'num_cpus': 24,
+                    },
+                    'devices': [
+                        {
+                            'type': 'gpu',
+                            'num_devices': 2,
+                        },
+                    ],
+                    'descr': 'normal partition'
+                },
+                {
+                    'name': 'gpu_short',
+                    'scheduler': 'slurm',
+                    'launcher': 'srun',
+                    'access': ['-p gpu_short'],
+                    'environs': ['fosscuda', 'container'],
+                    'container_platforms': [
+                        {
+                            'type': 'Singularity',
+                            'modules': [],
+                            'variables': [['SLURM_MPI_TYPE', 'pmix']]
+                        }
+                    ],
+                    'processor': {
+                        'num_cpus': 16,
+                    },
+                    'devices': [
+                        {
+                            'type': 'gpu',
+                            'num_devices': 2,
+                        },
+                    ],
+                    'descr': 'gpu partition'
+                },
+            ]
+        },
+    ],
+    'environments': [
+        {
+            'name': 'foss',
+            'modules': ['foss/2020a'],
+            'cc': 'mpicc',
+            'cxx': 'mpicxx',
+            'ftn': 'mpifort',
+        },
+        {
+            'name': 'fosscuda',
+            'modules': ['fosscuda/2020a'],
+            'cc': 'mpicc',
+            'cxx': 'mpicxx',
+            'ftn': 'mpifort',
+        },
+        {
+            'name': 'container',
+            'modules': [],
+        },
+    ],
+    'logging': [
+        {
+            'level': 'debug',
+            'handlers': [
+                {
+                    'type': 'stream',
+                    'name': 'stdout',
+                    'level': 'info',
+                    'format': '%(message)s'
+                },
+                {
+                    'type': 'file',
+                    'name': 'reframe.log',
+                    'level': 'debug',
+                    'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s',   # noqa: E501
+                    'append': False
+                }
+            ],
+            'handlers_perflog': [
+                {
+                    'type': 'filelog',
+                    'prefix': '%(check_system)s/%(check_partition)s',
+                    'level': 'info',
+                    'format': (
+                        '%(check_job_completion_time)s|reframe %(version)s|'
+                        '%(check_info)s|jobid=%(check_jobid)s|'
+                        '%(check_perf_var)s=%(check_perf_value)s|'
+                        'ref=%(check_perf_ref)s '
+                        '(l=%(check_perf_lower_thres)s, '
+                        'u=%(check_perf_upper_thres)s)|'
+                        '%(check_perf_unit)s'
+                    ),
+                    'append': True
+                }
+            ]
+        }
+    ],
+}
diff --git a/tests/reframe/config/system_properties.py b/tests/reframe/config/system_properties.py
new file mode 100644
index 0000000000..b27bc7c56c
--- /dev/null
+++ b/tests/reframe/config/system_properties.py
@@ -0,0 +1,2 @@
+
+ncorespernode=16
diff --git a/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py b/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
new file mode 100644
index 0000000000..827b84a3ef
--- /dev/null
+++ b/tests/reframe/eessi-checks/applications/src/tensorflow2_synthetic_benchmark.py
@@ -0,0 +1,158 @@
+# Copyright 2019 Uber Technologies, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import os
+import numpy as np
+import timeit
+
+import tensorflow as tf
+from tensorflow.keras import applications
+from tensorflow.keras import mixed_precision
+
+def log(s, nl=True):  # with --use-horovod, only rank 0 prints
+    if args.use_horovod:
+        if hvd.rank() != 0:
+            return
+    print(s, end='\n' if nl else '')
+
+# Benchmark settings
+parser = argparse.ArgumentParser(description='TensorFlow Synthetic Benchmark',
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--fp16-allreduce', action='store_true', default=False,
+                    help='use fp16 compression during allreduce')
+parser.add_argument('--mixed-prec', action='store_true', default=False,
+                    help='Use mixed precision for training')
+
+parser.add_argument('--model', type=str, default='ResNet50',
+                    help='model to benchmark')
+parser.add_argument('--batch-size', type=int, default=32,
+                    help='input batch size')
+
+parser.add_argument('--num-warmup-batches', type=int, default=2,
+                    help='number of warm-up batches that don\'t count towards benchmark')
+parser.add_argument('--num-batches-per-iter', type=int, default=10,
+                    help='number of batches per benchmark iteration')
+parser.add_argument('--num-iters', type=int, default=10,
+                    help='number of benchmark iterations')
+
+parser.add_argument('--no-cuda', action='store_true', default=False,
+                    help='disables CUDA training')
+parser.add_argument('--use-horovod', action='store_true', default=False)
+
+args = parser.parse_args()
+args.cuda = not args.no_cuda
+
+# Horovod: initialize Horovod.
+if args.use_horovod:
+    import horovod.tensorflow as hvd
+    hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process)
+if args.cuda:
+    gpus = tf.config.experimental.list_physical_devices('GPU')
+    for gpu in gpus:
+        tf.config.experimental.set_memory_growth(gpu, True)
+    if gpus and args.use_horovod:
+        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
+else:
+    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+if args.cuda:
+    tf.config.threading.set_inter_op_parallelism_threads(1)
+else:
+    tf.config.threading.set_inter_op_parallelism_threads(1)
+# If OMP_NUM_THREADS is unset, pass 0 so that TensorFlow picks the thread count itself.
+tf.config.threading.set_intra_op_parallelism_threads(int(os.environ.get('OMP_NUM_THREADS', '0')))
+
+if args.mixed_prec:
+    log('Running with mixed_float16 as global policy for the precision')
+    mixed_precision.set_global_policy('mixed_float16')
+
+# Set up standard model.
+model = getattr(applications, args.model)(weights=None)
+opt = tf.optimizers.SGD(0.01)
+
+data = tf.random.uniform([args.batch_size, 224, 224, 3])
+target = tf.random.uniform([args.batch_size, 1], minval=0, maxval=999, dtype=tf.int64)
+
+
+@tf.function
+def benchmark_step(first_batch):
+    # Horovod: (optional) compression algorithm.
+    if args.use_horovod:
+        compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
+
+    # Horovod: use DistributedGradientTape
+    with tf.GradientTape() as tape:
+        probs = model(data, training=True)
+        loss = tf.losses.sparse_categorical_crossentropy(target, probs)
+
+    # Horovod: add Horovod Distributed GradientTape.
+    if args.use_horovod:
+        tape = hvd.DistributedGradientTape(tape, compression=compression)
+
+    gradients = tape.gradient(loss, model.trainable_variables)
+    opt.apply_gradients(zip(gradients, model.trainable_variables))
+
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    #
+    # Note: broadcast should be done after the first gradient step to ensure optimizer
+    # initialization.
+    if args.use_horovod and first_batch:
+        hvd.broadcast_variables(model.variables, root_rank=0)
+        hvd.broadcast_variables(opt.variables(), root_rank=0)
+
+log('Model: %s' % args.model)
+log('Batch size: %d' % args.batch_size)
+device = 'GPU' if args.cuda else 'CPU'
+if args.use_horovod:
+    log('Number of %ss: %d' % (device, hvd.size()))
+else:
+    log('Number of %ss: %s' % (device, 1))
+
+
+with tf.device(device):
+    # Warm-up
+    log('Running warmup...')
+    benchmark_step(first_batch=True)
+
+    timeit.timeit(lambda: benchmark_step(first_batch=False),
+                  number=args.num_warmup_batches)
+
+    # Benchmark
+    log('Running benchmark...')
+    img_secs = []
+    for x in range(args.num_iters):
+        time = timeit.timeit(lambda: benchmark_step(first_batch=False),
+                             number=args.num_batches_per_iter)
+        img_sec = args.batch_size * args.num_batches_per_iter / time
+        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
+        img_secs.append(img_sec)
+
+    # Results
+    img_sec_mean = np.mean(img_secs)
+    img_sec_conf = 1.96 * np.std(img_secs)
+    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
+    if args.use_horovod:
+        ndevices = hvd.size()
+    else:
+        ndevices = 1
+    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
+        (ndevices, device, ndevices * img_sec_mean, ndevices * img_sec_conf))
+    log('Benchmark completed')
diff --git a/tests/reframe/eessi-checks/applications/tensorflow2.py b/tests/reframe/eessi-checks/applications/tensorflow2.py
new file mode 100644
index 0000000000..01e2bc511b
--- /dev/null
+++ b/tests/reframe/eessi-checks/applications/tensorflow2.py
@@ -0,0 +1,169 @@
+# This TensorFlow2 test is intended for single node, single GPU only
+# For multigpu and multinode tests, we use Horovod
+
+import os
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+
+class TensorFlow2Base(rfm.RunOnlyRegressionTest):
+    """Shared setup for the TensorFlow 2 synthetic benchmark checks.
+
+    The test is parameterized over the device type ('cpu' or 'gpu'), which
+    selects the output patterns, performance variable names and tag.
+    """
+
+    device = parameter(['cpu', 'gpu'])
+
+    def __init__(self):
+        self.valid_systems = ['*']
+
+        self.script = 'tensorflow2_synthetic_benchmark.py'
+        self.model = 'ResNet50'
+        self.batch_size = 32
+
+        self.sanity_patterns = sn.all([
+            sn.assert_found('Benchmark completed', self.stdout),
+        ])
+
+        # Each named group must match the tag passed to sn.extractsingle().
+        self.perf_patterns = {
+            'throughput': sn.extractsingle(
+                rf'Total img\/sec on [0-9]+ {self.device.upper()}\(s\): '
+                rf'(?P<throughput>\S+) \S+',
+                self.stdout, 'throughput', float),
+            f'throughput_per_{self.device}': sn.extractsingle(
+                rf'Img\/sec per {self.device.upper()}: (?P<throughput_per_{self.device}>\S+) \S+',
+                self.stdout, f'throughput_per_{self.device}', float)
+        }
+        self.reference = {
+            '*': {
+                'throughput': (None, None, None, 'img/sec'),
+                f'throughput_per_{self.device}': (None, None, None, 'img/sec')
+            }
+        }
+
+        self.tags = {f'{self.device}'}
+
+        self.maintainers = ['casparvl']
+
+
+@rfm.simple_test
+class TensorFlow2Native(TensorFlow2Base):
+    """Single-node, single-task run using the 'TensorFlow' module."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.descr = 'TensorFlow 2.X single gpu test. Based on the Horovod tensorflow2_synthetic_benchmark.py example.'
+
+        self.tags.add('native')
+        self.valid_prog_environs = ['*']
+
+        self.modules = ['TensorFlow']
+        self.executable = 'python'
+
+        self.executable_opts = [
+            f'{self.script}',
+            f'--model {self.model}',
+            f'--batch-size {self.batch_size}',
+            '--num-iters 5',
+            '--num-batches-per-iter 5',
+            '--num-warmup-batches 5',
+        ]
+        if self.device == 'cpu':
+            self.executable_opts.append('--no-cuda')
+
+        self.num_nodes = 1
+        self.num_tasks_per_node = 1
+
+        self.tags.add('singlenode')
+
+    # Set OMP_NUM_THREADS based on current partition properties
+    @rfm.run_before('run')
+    def set_num_threads(self):
+        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        self.variables = {
+            'OMP_NUM_THREADS': f'{self.num_cpus_per_task}',
+        }
+        if self.current_partition.launcher_type == 'mpirun':
+            self.job.launcher.options = ['-x OMP_NUM_THREADS']
+
+
+class HorovodTensorFlow2Base(TensorFlow2Base):
+    """Shared setup for the Horovod variants; adds a node-count 'scale' parameter."""
+
+    scale = parameter(['singlenode', 'small', 'large'])
+
+    def __init__(self):
+        super().__init__()
+
+        if self.scale == 'singlenode':
+            self.num_nodes = 1
+        elif self.scale == 'small':
+            self.num_nodes = 4
+        elif self.scale == 'large':
+            self.num_nodes = 10
+        self.tags.add(self.scale)
+
+    # Set number of tasks and threads (OMP_NUM_THREADS) based on current partition properties
+    @rfm.run_before('run')
+    def set_num_tasks(self):
+        # On CPU nodes, start 1 task per node. On GPU nodes, start 1 task per GPU.
+        if self.device == 'cpu':
+            # For now, keep it simple.
+            # In the future, we may want to launch 1 task per socket,
+            # and bind these tasks to their respective sockets.
+            self.num_tasks_per_node = 1
+        elif self.device == 'gpu':
+            device_count = [dev.num_devices for dev in self.current_partition.devices if dev.device_type == 'gpu']
+            # This test doesn't know what to do if multiple DIFFERENT GPU devices are present in a single partition, so assert that we only found one in the ReFrame config:
+            assert len(device_count) == 1
+            self.num_tasks_per_node = device_count[0]
+            # On some resource schedulers, you may need to request GPUs explicitly (e.g. --gpus-per-node=4).
+            # The extra_resources allows that to be put in the ReFrame settings file.
+            # See: https://reframe-hpc.readthedocs.io/en/stable/regression_test_api.html?highlight=num_gpus_per_node#reframe.core.pipeline.RegressionTest.extra_resources
+            # If the partition in the reframe settings file doesn't contain a resource with the name 'gpu', the self.extra_resources will be ignored.
+            self.extra_resources = {
+                'gpu': {'num_gpus_per_node': device_count[0]}
+            }
+        self.num_tasks = self.num_tasks_per_node * self.num_nodes
+        self.num_cpus_per_task = int(self.current_partition.processor.num_cpus / self.num_tasks_per_node)
+        # If test runs on CPU, leave one thread idle for Horovod. See https://github.com/horovod/horovod/issues/2804
+        if self.device == 'cpu':
+            num_threads = max(self.num_cpus_per_task-1, 1)
+        elif self.device == 'gpu':
+            num_threads = self.num_cpus_per_task
+        self.variables = {
+            'OMP_NUM_THREADS': f'{num_threads}',
+        }
+        if self.current_partition.launcher_type == 'mpirun':
+            self.job.launcher.options = ['-x OMP_NUM_THREADS']
+
+
+@rfm.simple_test
+class HorovodTensorFlow2Native(HorovodTensorFlow2Base):
+    """Horovod run of the benchmark using the 'Horovod' module."""
+
+    def __init__(self):
+        super().__init__()
+
+        self.descr = 'TensorFlow 2.X with Horovod multi-node and multi-GPU test. Based on the Horovod tensorflow2_synthetic_benchmark.py example.'
+
+        self.tags.add('native')
+        self.valid_prog_environs = ['*']
+
+        self.modules = ['Horovod']
+        self.executable = 'python'
+
+        self.executable_opts = [
+            f'{self.script}',
+            f'--model {self.model}',
+            f'--batch-size {self.batch_size}',
+            '--num-iters 5',
+            '--num-batches-per-iter 5',
+            '--num-warmup-batches 5',
+            '--use-horovod',
+        ]
+        if self.device == 'cpu':
+            self.executable_opts.append('--no-cuda')
diff --git a/tests/reframe/eessi-checks/prgenv/mpi.py b/tests/reframe/eessi-checks/prgenv/mpi.py
new file mode 100644
index 0000000000..df1a06483c
--- /dev/null
+++ b/tests/reframe/eessi-checks/prgenv/mpi.py
@@ -0,0 +1,25 @@
+import os
+import reframe as rfm
+import reframe.utility.sanity as sn
+
+# Try to use an import to define all site-specific things
+# import system_properties
+
+
+@rfm.simple_test
+class MpiHelloWorld(rfm.RegressionTest):
+    """ReFrame check for the 'mpi_hello_world.c' source, requesting 16 tasks per node."""
+
+    def __init__(self):
+        # We don't define these here to keep tests generic
+        # Sensible systems & programming environments should be defined in your site configuration file
+        self.valid_systems = ['*']
+        self.valid_prog_environs = ['*']
+
+        self.sourcepath = 'mpi_hello_world.c'
+        self.maintainers = ['casparvl']
+        # NOTE(review): this value is overwritten by the assignment two lines below.
+        self.num_tasks_per_node = -2
+#        self.num_tasks_per_node = system_properties.ncorespernode
+        self.num_tasks_per_node = 16
+        # NOTE(review): no sanity_patterns is set for this test - confirm whether that is intentional.
diff --git a/tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c b/tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c
new file mode 100644
index 0000000000..35458d5d68
--- /dev/null
+++ b/tests/reframe/eessi-checks/prgenv/src/mpi_hello_world.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <string.h>
+#include <mpi.h>
+
+#define MSG_SIZE_MAX 255
+
+
+/*
+ * Every non-zero rank sends a fixed greeting to rank 0; rank 0 counts how
+ * many of the received messages match and prints that count.
+ */
+int main(int argc, char **argv)
+{
+    const char *msg = "Hello, World!";
+    char msg_buff[MSG_SIZE_MAX+1];
+    size_t msg_len = strnlen(msg, MSG_SIZE_MAX);
+    int rank, num_tasks, i;
+    int dest = 0;
+    int tag = 0;
+    int nr_correct = 0;
+    MPI_Status status;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    if (num_tasks < 2) {
+        fprintf(stderr, "Not enough tasks to run the test.\n");
+        MPI_Finalize();
+        return 1;
+    }
+
+    if (rank != 0) {
+        strncpy(msg_buff, msg, MSG_SIZE_MAX);
+        MPI_Send(msg_buff, msg_len+1, MPI_CHAR, dest, tag, MPI_COMM_WORLD);
+    } else {
+        for (i = 1; i < num_tasks; i++) {
+            MPI_Recv(msg_buff, msg_len+1, MPI_CHAR,
+                     i, tag, MPI_COMM_WORLD, &status);
+            if (!strncmp(msg, msg_buff, MSG_SIZE_MAX))
+                nr_correct++;
+        }
+        printf("Received correct messages from %d processes.\n", nr_correct);
+    }
+
+    MPI_Finalize();
+    return 0;
+}