From 8fb1ff9355abc47e66171ee776daa6d15d18e28d Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Tue, 7 Nov 2023 13:21:47 -0800 Subject: [PATCH 1/4] added initial integration between dyad and dlio_benchmark --- tests/__init__.py | 0 tests/integration/__init__.py | 0 tests/integration/dlio_benchmark/__init__.py | 0 .../configs/workload/dyad_unet3d.yaml | 38 ++++++ .../dlio_benchmark/dyad_torch_data_loader.py | 125 ++++++++++++++++++ .../dlio_benchmark/requirements.txt | 1 + tests/integration/dlio_benchmark/script.sh | 19 +++ 7 files changed, 183 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/integration/__init__.py create mode 100644 tests/integration/dlio_benchmark/__init__.py create mode 100644 tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml create mode 100644 tests/integration/dlio_benchmark/dyad_torch_data_loader.py create mode 100644 tests/integration/dlio_benchmark/requirements.txt create mode 100755 tests/integration/dlio_benchmark/script.sh diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/dlio_benchmark/__init__.py b/tests/integration/dlio_benchmark/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml b/tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml new file mode 100644 index 00000000..4403c7ec --- /dev/null +++ b/tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml @@ -0,0 +1,38 @@ +model: unet3d + +framework: pytorch + +workflow: + generate_data: False + train: True + evaluation: True + profiling: False + +dataset: + data_folder: data/dyad_unet3d + format: npz + num_files_train: 1 + num_files_eval: 1 + num_samples_per_file: 1 + record_length: 4096 + num_subfolders_train: 2 + num_subfolders_eval: 2 + +reader: + data_loader: pytorch + batch_size: 1 + batch_size_eval: 1 + data_loader_classname: dyad_torch_data_loader.DyadTorchDataLoader + data_loader_sampler: index + +train: + epochs: 10 + computation_time: 1.00 + + +evaluation: + eval_time: 0.5 + epochs_between_evals: 1 + +profiling: + profiler: iostat \ No newline at end of file diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py new file mode 100644 index 00000000..174f062c --- /dev/null +++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py @@ -0,0 +1,125 @@ +from time import time +import logging +import math +import torch +from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler + +from dlio_benchmark.common.constants import MODULE_DATA_LOADER +from dlio_benchmark.common.enumerations import Shuffle, DatasetType, DataLoaderType +from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader +from dlio_benchmark.reader.reader_factory import ReaderFactory +from dlio_benchmark.utils.utility import utcnow, get_rank, Profile +from pydyad import Dyad, dyad_open +import numpy as np +dlp = Profile(MODULE_DATA_LOADER) +import os + +class DYADTorchDataset(Dataset): + """ + Currently, we only support loading one sample per file + TODO: support multiple samples per file + """ + @dlp.log_init + def __init__(self, format_type, dataset_type, epoch, num_samples, num_workers, batch_size): + self.format_type = format_type + self.dataset_type = dataset_type + self.epoch_number = epoch + self.num_samples = num_samples + self.reader = None + self.num_images_read = 0 + self.batch_size = batch_size + if num_workers == 0: + self.worker_init(-1) + + @dlp.log + def worker_init(self, worker_id): + logging.debug(f"{utcnow()} worker initialized {worker_id} with format {self.format_type}") + self.reader = ReaderFactory.get_reader(type=self.format_type, + dataset_type=self.dataset_type, + thread_index=worker_id, + epoch_number=self.epoch_number) + self.dyad_io = Dyad() + self.dyad_io.init() + self.conf = ConfigArguments.get_instance() + + @dlp.log + def __len__(self): + return self.num_samples + + @dlp.log + def __getitem__(self, image_idx): + self.num_images_read += 1 + step = int(math.ceil(self.num_images_read / self.batch_size)) + logging.info(f"{utcnow()} Rank {get_rank()} reading {image_idx} sample") + filename, sample_index = self._args.global_index_map[image_idx] + base_fname = os.path.basename(filename) + file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False) + if file_obj: + with dyad_open(base_fname, "rb") as f: + data = np.load(f) + else: + data = self.reader.read_index(image_idx, step) + with dyad_open(base_fname, "wb") as f: + np.save(f, data) + return data + +class DyadTorchDataLoader(BaseDataLoader): + @dlp.log_init + def __init__(self, format_type, dataset_type, epoch_number): + super().__init__(format_type, dataset_type, epoch_number, DataLoaderType.PYTORCH) + + @dlp.log + def read(self): + do_shuffle = True if self._args.sample_shuffle != Shuffle.OFF else False + num_samples = self._args.total_samples_train if self.dataset_type is DatasetType.TRAIN else self._args.total_samples_eval + batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval + dataset = DYADTorchDataset(self.format_type, self.dataset_type, self.epoch_number, num_samples, self._args.read_threads, batch_size) + if do_shuffle: + sampler = RandomSampler(dataset) + else: + sampler = SequentialSampler(dataset) + if self._args.read_threads > 1: + prefetch_factor = math.ceil(self._args.prefetch_size / self._args.read_threads) + else: + prefetch_factor = self._args.prefetch_size + if prefetch_factor > 0: + if self._args.my_rank == 0: + logging.debug( + f"{utcnow()} Prefetch size is {self._args.prefetch_size}; prefetch factor of {prefetch_factor} will be set to Torch DataLoader.") + else: + if self._args.my_rank == 0: + logging.debug( + f"{utcnow()} Prefetch size is 0; a default prefetch factor of 2 will be set to Torch DataLoader.") + logging.debug(f"{utcnow()} Setup dataloader with {self._args.read_threads} workers {torch.__version__}") + if torch.__version__ == '1.3.1': + self._dataset = DataLoader(dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=self._args.read_threads, + pin_memory=True, + drop_last=True, + worker_init_fn=dataset.worker_init) + else: + self._dataset = DataLoader(dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=self._args.read_threads, + pin_memory=True, + drop_last=True, + worker_init_fn=dataset.worker_init, + prefetch_factor=prefetch_factor if prefetch_factor > 0 else 2) # 2 is the default value + logging.debug(f"{utcnow()} Rank {self._args.my_rank} will read {len(self._dataset) * batch_size} files") + + # self._dataset.sampler.set_epoch(epoch_number) + + @dlp.log + def next(self): + super().next() + total = self._args.training_steps if self.dataset_type is DatasetType.TRAIN else self._args.eval_steps + logging.debug(f"{utcnow()} Rank {self._args.my_rank} should read {total} batches") + for batch in self._dataset: + yield batch + + @dlp.log + def finalize(self): + pass \ No newline at end of file diff --git a/tests/integration/dlio_benchmark/requirements.txt b/tests/integration/dlio_benchmark/requirements.txt new file mode 100644 index 00000000..56e6089b --- /dev/null +++ b/tests/integration/dlio_benchmark/requirements.txt @@ -0,0 +1 @@ +dlio_benchmark[dlio_profiler] @ git+https://github.com/argonne-lcf/dlio_benchmark.git \ No newline at end of file diff --git a/tests/integration/dlio_benchmark/script.sh b/tests/integration/dlio_benchmark/script.sh new file mode 100755 index 00000000..03413fd0 --- /dev/null +++ b/tests/integration/dlio_benchmark/script.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -x + +DLIO_ENV=$1 +DYAD_PROJECT_DIR=$2 +DLIO_ENV=/home/haridev/projects/dyad/venv +DYAD_PROJECT_DIR=/home/haridev/projects/dyad +SPACK_ENV=/home/haridev/projects/dyad/spack-env/.spack-env/view + +source ${DLIO_ENV}/bin/activate +export PYTHONPATH=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark:$PYTHONPATH +export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${SPACK_ENV}/lib:${SPACK_ENV}/lib64:${LD_LIBRARY_PATH} + +echo Generate Data +dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False + +echo Run Training +dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=False ++workload.workflow.train=True From edffef38c70c0b5a74c80d143c75276f9de28c81 Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Tue, 7 Nov 2023 14:23:28 -0800 Subject: [PATCH 2/4] add environment --- tests/integration/dlio_benchmark/dyad_torch_data_loader.py | 5 +++-- tests/integration/dlio_benchmark/script.sh | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py index 174f062c..30fc253a 100644 --- a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py +++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py @@ -9,6 +9,7 @@ from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader from dlio_benchmark.reader.reader_factory import ReaderFactory from dlio_benchmark.utils.utility import utcnow, get_rank, Profile +from dlio_benchmark.utils.config import ConfigArguments from pydyad import Dyad, dyad_open import numpy as np dlp = Profile(MODULE_DATA_LOADER) @@ -39,7 +40,7 @@ def worker_init(self, worker_id): thread_index=worker_id, epoch_number=self.epoch_number) self.dyad_io = Dyad() - self.dyad_io.init() + self.dyad_io.init_env() self.conf = ConfigArguments.get_instance() @dlp.log @@ -51,7 +52,7 @@ def __getitem__(self, image_idx): self.num_images_read += 1 step = int(math.ceil(self.num_images_read / self.batch_size)) logging.info(f"{utcnow()} Rank {get_rank()} reading {image_idx} sample") - filename, sample_index = self._args.global_index_map[image_idx] + filename, sample_index = self.conf.global_index_map[image_idx] base_fname = os.path.basename(filename) file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False) if file_obj: diff --git a/tests/integration/dlio_benchmark/script.sh b/tests/integration/dlio_benchmark/script.sh index 03413fd0..354ac902 100755 --- a/tests/integration/dlio_benchmark/script.sh +++ b/tests/integration/dlio_benchmark/script.sh @@ -6,11 +6,16 @@ DLIO_ENV=$1 DYAD_PROJECT_DIR=$2 DLIO_ENV=/home/haridev/projects/dyad/venv DYAD_PROJECT_DIR=/home/haridev/projects/dyad -SPACK_ENV=/home/haridev/projects/dyad/spack-env/.spack-env/view source ${DLIO_ENV}/bin/activate export PYTHONPATH=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark:$PYTHONPATH export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${SPACK_ENV}/lib:${SPACK_ENV}/lib64:${LD_LIBRARY_PATH} +export DYAD_KVS_NAMESPACE=test +export DYAD_DTL_MODE=UCX +export DYAD_PATH=$PWD/dyad +export DYAD_PATH_PRODUCER=$DYAD_PATH +export DYAD_PATH_CONSUMER=$DYAD_PATH +mkdir -p $DYAD_PATH echo Generate Data dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False From 983821c1d5b9ad0b2b04125435e82c8d45262432 Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Tue, 7 Nov 2023 14:36:38 -0800 Subject: [PATCH 3/4] pass ctx to dyad_open --- tests/integration/dlio_benchmark/dyad_torch_data_loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py index 30fc253a..486ddc2c 100644 --- a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py +++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py @@ -56,11 +56,11 @@ def __getitem__(self, image_idx): base_fname = os.path.basename(filename) file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False) if file_obj: - with dyad_open(base_fname, "rb") as f: + with dyad_open(base_fname, "rb", dyad_ctx=self.dyad_io) as f: data = np.load(f) else: data = self.reader.read_index(image_idx, step) - with dyad_open(base_fname, "wb") as f: + with dyad_open(base_fname, "wb", dyad_ctx=self.dyad_io) as f: np.save(f, data) return data From 4a8b6abf1626b6c35015dff8fa13f93632112c84 Mon Sep 17 00:00:00 2001 From: Hariharan Devarajan Date: Tue, 7 Nov 2023 14:36:38 -0800 Subject: [PATCH 4/4] fix scripts and CI --- .github/workflows/compile_test.yaml | 1 + tests/integration/dlio_benchmark/dyad_torch_data_loader.py | 4 ++-- tests/integration/dlio_benchmark/script.sh | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/compile_test.yaml b/.github/workflows/compile_test.yaml index a24cc643..d15e8ee6 100644 --- a/.github/workflows/compile_test.yaml +++ b/.github/workflows/compile_test.yaml @@ -34,6 +34,7 @@ jobs: ref: ${{ github.event.pull_request.head.sha }} - name: Install system deps run: | + sudo apt-get update sudo apt-get install -y --no-install-recommends \ gcc g++ gfortran \ autoconf \ diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py index 30fc253a..486ddc2c 100644 --- a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py +++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py @@ -56,11 +56,11 @@ def __getitem__(self, image_idx): base_fname = os.path.basename(filename) file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False) if file_obj: - with dyad_open(base_fname, "rb") as f: + with dyad_open(base_fname, "rb", dyad_ctx=self.dyad_io) as f: data = np.load(f) else: data = self.reader.read_index(image_idx, step) - with dyad_open(base_fname, "wb") as f: + with dyad_open(base_fname, "wb", dyad_ctx=self.dyad_io) as f: np.save(f, data) return data diff --git a/tests/integration/dlio_benchmark/script.sh b/tests/integration/dlio_benchmark/script.sh index 354ac902..1236e27a 100755 --- a/tests/integration/dlio_benchmark/script.sh +++ b/tests/integration/dlio_benchmark/script.sh @@ -9,7 +9,7 @@ DYAD_PROJECT_DIR=/home/haridev/projects/dyad source ${DLIO_ENV}/bin/activate export PYTHONPATH=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark:$PYTHONPATH -export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${SPACK_ENV}/lib:${SPACK_ENV}/lib64:${LD_LIBRARY_PATH} +export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${LD_LIBRARY_PATH} export DYAD_KVS_NAMESPACE=test export DYAD_DTL_MODE=UCX export DYAD_PATH=$PWD/dyad @@ -17,6 +17,9 @@ export DYAD_PATH_PRODUCER=$DYAD_PATH export DYAD_PATH_CONSUMER=$DYAD_PATH mkdir -p $DYAD_PATH +echo flux module load ${DLIO_ENV}/lib/dyad.so $DYAD_PATH_PRODUCER $DYAD_DTL_MODE +flux module load ${DLIO_ENV}/lib/dyad.so $DYAD_PATH_PRODUCER $DYAD_DTL_MODE + echo Generate Data dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False