From 8fb1ff9355abc47e66171ee776daa6d15d18e28d Mon Sep 17 00:00:00 2001
From: Hariharan Devarajan <mani.hariharan@gmail.com>
Date: Tue, 7 Nov 2023 13:21:47 -0800
Subject: [PATCH 1/4] added initial integration between dyad and dlio_benchmark

---
 tests/__init__.py                             |   0
 tests/integration/__init__.py                 |   0
 tests/integration/dlio_benchmark/__init__.py  |   0
 .../configs/workload/dyad_unet3d.yaml         |  38 ++++++
 .../dlio_benchmark/dyad_torch_data_loader.py  | 125 ++++++++++++++++++
 .../dlio_benchmark/requirements.txt           |   1 +
 tests/integration/dlio_benchmark/script.sh    |  19 +++
 7 files changed, 183 insertions(+)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/integration/dlio_benchmark/__init__.py
 create mode 100644 tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml
 create mode 100644 tests/integration/dlio_benchmark/dyad_torch_data_loader.py
 create mode 100644 tests/integration/dlio_benchmark/requirements.txt
 create mode 100755 tests/integration/dlio_benchmark/script.sh

diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/dlio_benchmark/__init__.py b/tests/integration/dlio_benchmark/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml b/tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml
new file mode 100644
index 00000000..4403c7ec
--- /dev/null
+++ b/tests/integration/dlio_benchmark/configs/workload/dyad_unet3d.yaml
@@ -0,0 +1,38 @@
+model: unet3d
+
+framework: pytorch
+
+workflow:
+  generate_data: False
+  train: True
+  evaluation: True
+  profiling: False
+
+dataset:
+  data_folder: data/dyad_unet3d
+  format: npz
+  num_files_train: 1
+  num_files_eval: 1
+  num_samples_per_file: 1
+  record_length: 4096
+  num_subfolders_train: 2
+  num_subfolders_eval: 2
+
+reader:
+  data_loader: pytorch
+  batch_size: 1
+  batch_size_eval: 1
+  data_loader_classname: dyad_torch_data_loader.DyadTorchDataLoader
+  data_loader_sampler: index
+
+train:
+  epochs: 10
+  computation_time: 1.00
+
+
+evaluation:
+  eval_time: 0.5
+  epochs_between_evals: 1
+
+profiling:
+  profiler: iostat
\ No newline at end of file
diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
new file mode 100644
index 00000000..174f062c
--- /dev/null
+++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
@@ -0,0 +1,125 @@
+from time import time
+import logging
+import math
+import torch
+from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+
+from dlio_benchmark.common.constants import MODULE_DATA_LOADER
+from dlio_benchmark.common.enumerations import Shuffle, DatasetType, DataLoaderType
+from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
+from dlio_benchmark.reader.reader_factory import ReaderFactory
+from dlio_benchmark.utils.utility import utcnow, get_rank, Profile
+from pydyad import Dyad, dyad_open
+import numpy as np
+dlp = Profile(MODULE_DATA_LOADER)
+import os
+
+class DYADTorchDataset(Dataset):
+    """
+    Currently, we only support loading one sample per file
+    TODO: support multiple samples per file
+    """
+    @dlp.log_init
+    def __init__(self, format_type, dataset_type, epoch, num_samples, num_workers, batch_size):
+        self.format_type = format_type
+        self.dataset_type = dataset_type
+        self.epoch_number = epoch
+        self.num_samples = num_samples
+        self.reader = None
+        self.num_images_read = 0
+        self.batch_size = batch_size
+        if num_workers == 0:
+            self.worker_init(-1)
+
+    @dlp.log
+    def worker_init(self, worker_id):
+        logging.debug(f"{utcnow()} worker initialized {worker_id} with format {self.format_type}")
+        self.reader = ReaderFactory.get_reader(type=self.format_type,
+                                               dataset_type=self.dataset_type,
+                                               thread_index=worker_id,
+                                               epoch_number=self.epoch_number)
+        self.dyad_io = Dyad()
+        self.dyad_io.init()
+        self.conf = ConfigArguments.get_instance()
+
+    @dlp.log
+    def __len__(self):
+        return self.num_samples
+
+    @dlp.log
+    def __getitem__(self, image_idx):
+        self.num_images_read += 1
+        step = int(math.ceil(self.num_images_read / self.batch_size))
+        logging.info(f"{utcnow()} Rank {get_rank()} reading {image_idx} sample")
+        filename, sample_index = self._args.global_index_map[image_idx]
+        base_fname = os.path.basename(filename)
+        file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False)
+        if file_obj:
+            with dyad_open(base_fname, "rb") as f:
+                data = np.load(f)
+        else:
+            data = self.reader.read_index(image_idx, step)
+            with dyad_open(base_fname, "wb") as f:
+                np.save(f, data)
+        return data
+
+class DyadTorchDataLoader(BaseDataLoader):
+    @dlp.log_init
+    def __init__(self, format_type, dataset_type, epoch_number):
+        super().__init__(format_type, dataset_type, epoch_number, DataLoaderType.PYTORCH)
+
+    @dlp.log
+    def read(self):
+        do_shuffle = True if self._args.sample_shuffle != Shuffle.OFF else False
+        num_samples = self._args.total_samples_train if self.dataset_type is DatasetType.TRAIN else self._args.total_samples_eval
+        batch_size = self._args.batch_size if self.dataset_type is DatasetType.TRAIN else self._args.batch_size_eval
+        dataset = DYADTorchDataset(self.format_type, self.dataset_type, self.epoch_number, num_samples, self._args.read_threads, batch_size)
+        if do_shuffle:
+            sampler = RandomSampler(dataset)
+        else:
+            sampler = SequentialSampler(dataset)
+        if self._args.read_threads > 1:
+            prefetch_factor = math.ceil(self._args.prefetch_size / self._args.read_threads)
+        else:
+            prefetch_factor = self._args.prefetch_size
+        if prefetch_factor > 0:
+            if self._args.my_rank == 0:
+                logging.debug(
+                    f"{utcnow()} Prefetch size is {self._args.prefetch_size}; prefetch factor of {prefetch_factor} will be set to Torch DataLoader.")
+        else:
+            if self._args.my_rank == 0:
+                logging.debug(
+                    f"{utcnow()} Prefetch size is 0; a default prefetch factor of 2 will be set to Torch DataLoader.")
+        logging.debug(f"{utcnow()} Setup dataloader with {self._args.read_threads} workers {torch.__version__}")
+        if torch.__version__ == '1.3.1':
+            self._dataset = DataLoader(dataset,
+                                       batch_size=batch_size,
+                                       sampler=sampler,
+                                       num_workers=self._args.read_threads,
+                                       pin_memory=True,
+                                       drop_last=True,
+                                       worker_init_fn=dataset.worker_init)
+        else:
+            self._dataset = DataLoader(dataset,
+                                       batch_size=batch_size,
+                                       sampler=sampler,
+                                       num_workers=self._args.read_threads,
+                                       pin_memory=True,
+                                       drop_last=True,
+                                       worker_init_fn=dataset.worker_init,
+                                       prefetch_factor=prefetch_factor if prefetch_factor > 0 else 2)  # 2 is the default value
+        logging.debug(f"{utcnow()} Rank {self._args.my_rank} will read {len(self._dataset) * batch_size} files")
+
+        # self._dataset.sampler.set_epoch(epoch_number)
+
+    @dlp.log
+    def next(self):
+        super().next()
+        total = self._args.training_steps if self.dataset_type is DatasetType.TRAIN else self._args.eval_steps
+        logging.debug(f"{utcnow()} Rank {self._args.my_rank} should read {total} batches")
+        for batch in self._dataset:
+            yield batch
+
+    @dlp.log
+    def finalize(self):
+        pass
\ No newline at end of file
diff --git a/tests/integration/dlio_benchmark/requirements.txt b/tests/integration/dlio_benchmark/requirements.txt
new file mode 100644
index 00000000..56e6089b
--- /dev/null
+++ b/tests/integration/dlio_benchmark/requirements.txt
@@ -0,0 +1 @@
+dlio_benchmark[dlio_profiler] @ git+https://github.com/argonne-lcf/dlio_benchmark.git
\ No newline at end of file
diff --git a/tests/integration/dlio_benchmark/script.sh b/tests/integration/dlio_benchmark/script.sh
new file mode 100755
index 00000000..03413fd0
--- /dev/null
+++ b/tests/integration/dlio_benchmark/script.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -x
+
+DLIO_ENV=$1
+DYAD_PROJECT_DIR=$2
+DLIO_ENV=/home/haridev/projects/dyad/venv
+DYAD_PROJECT_DIR=/home/haridev/projects/dyad
+SPACK_ENV=/home/haridev/projects/dyad/spack-env/.spack-env/view
+
+source ${DLIO_ENV}/bin/activate
+export PYTHONPATH=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark:$PYTHONPATH
+export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${SPACK_ENV}/lib:${SPACK_ENV}/lib64:${LD_LIBRARY_PATH}
+
+echo Generate Data
+dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False
+
+echo Run Training
+dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=False ++workload.workflow.train=True

From edffef38c70c0b5a74c80d143c75276f9de28c81 Mon Sep 17 00:00:00 2001
From: Hariharan Devarajan <mani.hariharan@gmail.com>
Date: Tue, 7 Nov 2023 14:23:28 -0800
Subject: [PATCH 2/4] add environment

---
 tests/integration/dlio_benchmark/dyad_torch_data_loader.py | 5 +++--
 tests/integration/dlio_benchmark/script.sh                 | 7 ++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
index 174f062c..30fc253a 100644
--- a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
+++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
@@ -9,6 +9,7 @@
 from dlio_benchmark.data_loader.base_data_loader import BaseDataLoader
 from dlio_benchmark.reader.reader_factory import ReaderFactory
 from dlio_benchmark.utils.utility import utcnow, get_rank, Profile
+from dlio_benchmark.utils.config import ConfigArguments
 from pydyad import Dyad, dyad_open
 import numpy as np
 dlp = Profile(MODULE_DATA_LOADER)
@@ -39,7 +40,7 @@ def worker_init(self, worker_id):
                                                thread_index=worker_id,
                                                epoch_number=self.epoch_number)
         self.dyad_io = Dyad()
-        self.dyad_io.init()
+        self.dyad_io.init_env()
         self.conf = ConfigArguments.get_instance()
 
     @dlp.log
@@ -51,7 +52,7 @@ def __getitem__(self, image_idx):
         self.num_images_read += 1
         step = int(math.ceil(self.num_images_read / self.batch_size))
         logging.info(f"{utcnow()} Rank {get_rank()} reading {image_idx} sample")
-        filename, sample_index = self._args.global_index_map[image_idx]
+        filename, sample_index = self.conf.global_index_map[image_idx]
         base_fname = os.path.basename(filename)
         file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False)
         if file_obj:
diff --git a/tests/integration/dlio_benchmark/script.sh b/tests/integration/dlio_benchmark/script.sh
index 03413fd0..354ac902 100755
--- a/tests/integration/dlio_benchmark/script.sh
+++ b/tests/integration/dlio_benchmark/script.sh
@@ -6,11 +6,16 @@ DLIO_ENV=$1
 DYAD_PROJECT_DIR=$2
 DLIO_ENV=/home/haridev/projects/dyad/venv
 DYAD_PROJECT_DIR=/home/haridev/projects/dyad
-SPACK_ENV=/home/haridev/projects/dyad/spack-env/.spack-env/view
 
 source ${DLIO_ENV}/bin/activate
 export PYTHONPATH=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark:$PYTHONPATH
 export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${SPACK_ENV}/lib:${SPACK_ENV}/lib64:${LD_LIBRARY_PATH}
+export DYAD_KVS_NAMESPACE=test
+export DYAD_DTL_MODE=UCX
+export DYAD_PATH=$PWD/dyad
+export DYAD_PATH_PRODUCER=$DYAD_PATH
+export DYAD_PATH_CONSUMER=$DYAD_PATH
+mkdir -p $DYAD_PATH
 
 echo Generate Data
 dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False

From 983821c1d5b9ad0b2b04125435e82c8d45262432 Mon Sep 17 00:00:00 2001
From: Hariharan Devarajan <mani.hariharan@gmail.com>
Date: Tue, 7 Nov 2023 14:36:38 -0800
Subject: [PATCH 3/4] pass ctx to dyad_open

---
 tests/integration/dlio_benchmark/dyad_torch_data_loader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
index 30fc253a..486ddc2c 100644
--- a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
+++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
@@ -56,11 +56,11 @@ def __getitem__(self, image_idx):
         base_fname = os.path.basename(filename)
         file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False)
         if file_obj:
-            with dyad_open(base_fname, "rb") as f:
+            with dyad_open(base_fname, "rb", dyad_ctx=self.dyad_io) as f:
                 data = np.load(f)
         else:
             data = self.reader.read_index(image_idx, step)
-            with dyad_open(base_fname, "wb") as f:
+            with dyad_open(base_fname, "wb", dyad_ctx=self.dyad_io) as f:
                 np.save(f, data)
         return data
 

From 4a8b6abf1626b6c35015dff8fa13f93632112c84 Mon Sep 17 00:00:00 2001
From: Hariharan Devarajan <mani.hariharan@gmail.com>
Date: Tue, 7 Nov 2023 14:36:38 -0800
Subject: [PATCH 4/4] fix scripts and CI

---
 .github/workflows/compile_test.yaml                        | 1 +
 tests/integration/dlio_benchmark/dyad_torch_data_loader.py | 4 ++--
 tests/integration/dlio_benchmark/script.sh                 | 5 ++++-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/compile_test.yaml b/.github/workflows/compile_test.yaml
index a24cc643..d15e8ee6 100644
--- a/.github/workflows/compile_test.yaml
+++ b/.github/workflows/compile_test.yaml
@@ -34,6 +34,7 @@ jobs:
           ref: ${{ github.event.pull_request.head.sha }}
       - name: Install system deps
         run: |
+          sudo apt-get update
           sudo apt-get install -y --no-install-recommends  \
                               gcc g++ gfortran \
                               autoconf \
diff --git a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
index 30fc253a..486ddc2c 100644
--- a/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
+++ b/tests/integration/dlio_benchmark/dyad_torch_data_loader.py
@@ -56,11 +56,11 @@ def __getitem__(self, image_idx):
         base_fname = os.path.basename(filename)
         file_obj = self.dyad_io.get_metadata(fname=base_fname, should_wait=False)
         if file_obj:
-            with dyad_open(base_fname, "rb") as f:
+            with dyad_open(base_fname, "rb", dyad_ctx=self.dyad_io) as f:
                 data = np.load(f)
         else:
             data = self.reader.read_index(image_idx, step)
-            with dyad_open(base_fname, "wb") as f:
+            with dyad_open(base_fname, "wb", dyad_ctx=self.dyad_io) as f:
                 np.save(f, data)
         return data
 
diff --git a/tests/integration/dlio_benchmark/script.sh b/tests/integration/dlio_benchmark/script.sh
index 354ac902..1236e27a 100755
--- a/tests/integration/dlio_benchmark/script.sh
+++ b/tests/integration/dlio_benchmark/script.sh
@@ -9,7 +9,7 @@ DYAD_PROJECT_DIR=/home/haridev/projects/dyad
 
 source ${DLIO_ENV}/bin/activate
 export PYTHONPATH=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark:$PYTHONPATH
-export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${SPACK_ENV}/lib:${SPACK_ENV}/lib64:${LD_LIBRARY_PATH}
+export LD_LIBRARY_PATH=${DLIO_ENV}/lib:${DLIO_ENV}/lib64:${LD_LIBRARY_PATH}
 export DYAD_KVS_NAMESPACE=test
 export DYAD_DTL_MODE=UCX
 export DYAD_PATH=$PWD/dyad
@@ -17,6 +17,9 @@ export DYAD_PATH_PRODUCER=$DYAD_PATH
 export DYAD_PATH_CONSUMER=$DYAD_PATH
 mkdir -p $DYAD_PATH
 
+echo flux module load ${DLIO_ENV}/lib/dyad.so  $DYAD_PATH_PRODUCER $DYAD_DTL_MODE
+flux module load ${DLIO_ENV}/lib/dyad.so  $DYAD_PATH_PRODUCER $DYAD_DTL_MODE
+
 echo Generate Data
 dlio_benchmark --config-dir=${DYAD_PROJECT_DIR}/tests/integration/dlio_benchmark/configs workload=dyad_unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False