From a80222d3ae3289ba0f5fcf9c0ed309b1a761846f Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Tue, 6 Aug 2024 11:53:14 +0200
Subject: [PATCH 1/9] fix: update getting started example config

---
 examples/getting_started/example_config.yaml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/getting_started/example_config.yaml b/examples/getting_started/example_config.yaml
index 0ba264c3..14f85dba 100644
--- a/examples/getting_started/example_config.yaml
+++ b/examples/getting_started/example_config.yaml
@@ -254,8 +254,7 @@ batch_progress_subscriber:
   component_key: progress_subscriber
   variant_key: rich
   config:
-    local_rank: ${settings.cuda_env.local_rank}
-    world_size: ${settings.cuda_env.world_size}
+    global_rank: ${settings.cuda_env.global_rank}
     global_num_seen_steps:
       component_key: number_conversion
       variant_key: num_steps_from_num_tokens
@@ -277,7 +276,7 @@ evaluation_subscriber:
   component_key: results_subscriber
   variant_key: wandb
   config:
-    local_rank: ${settings.cuda_env.local_rank}
+    global_rank: ${settings.cuda_env.global_rank}
     project: modalities_getting_started
     mode: OFFLINE
     experiment_id: ${settings.experiment_id}

From 11dfbfae7fde62a4eb09cd86afb2812f6c5e1619 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Tue, 6 Aug 2024 11:58:28 +0200
Subject: [PATCH 2/9] feat: add shell script to run complete getting started
 example

---
 .../run_getting_started_example.sh            | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 examples/getting_started/run_getting_started_example.sh

diff --git a/examples/getting_started/run_getting_started_example.sh b/examples/getting_started/run_getting_started_example.sh
new file mode 100644
index 00000000..8cafe0ab
--- /dev/null
+++ b/examples/getting_started/run_getting_started_example.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+
+# ---------------------------------------------------
+# bash run_getting_started_example.sh 0,1,2,3,4,5,6,7
+# ---------------------------------------------------
+
+#######################
+### INPUT ARGUMENTS ###
+#######################
+if [ -z "$1" ]  # if input argument does not exist
+  then
+    echo "Need to specify the GPU devices as arguments, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
+    exit
+fi
+CUDA_VISIBLE_DEVICES=$1
+
+first_character=${1:0:1}
+if [[ $first_character =~ [^0-7] ]]   # if the first character is not an integer 0-7
+    then
+        echo "First character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
+        exit
+fi
+
+last_character=${1:0-1}
+if [[ $last_character =~ [^0-7] ]]   # if the first character is not an integer 0-7
+    then
+        echo "Last character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
+        exit
+fi
+
+#############
+### RUN #####
+#############
+echo "> run getting_started_examples on CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES
+
+modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl
+modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl
+modalities data pack_encoded_data example_dataset_config_train.yaml
+modalities data pack_encoded_data example_dataset_config_test.yaml
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path example_config.yaml

From 6974d0c3c3b09693588aa78a2373a0ddb758479e Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Tue, 6 Aug 2024 12:06:12 +0200
Subject: [PATCH 3/9] chore: add checks to PR template

---
 .github/pull_request_template.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 4c04b72f..de2df236 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -13,4 +13,6 @@ This PR ..
 - [ ] I have merged the latest version of the target branch into this feature branch
 - [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc.
 - [ ] I have run a sample config for model training
-- [ ] I have checked that all tests run through (`python tests/tests.py`)
\ No newline at end of file
+- [ ] I have run the getting started example (`bash run_getting_started_example.sh`)
+- [ ] I have checked that all tests run through (`python tests/tests.py`)
+- [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`)
\ No newline at end of file

From 54ea4c1cccf5b7734efa9bc2e71158d4f7ec2a73 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Tue, 6 Aug 2024 14:02:16 +0200
Subject: [PATCH 4/9] test: include getting started example in tests

---
 .github/pull_request_template.md |  1 -
 tests/run_distributed_tests.sh   |  2 +-
 tests/tests.py                   | 12 ++++++++++++
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index de2df236..7c211215 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -13,6 +13,5 @@ This PR ..
 - [ ] I have merged the latest version of the target branch into this feature branch
 - [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc.
 - [ ] I have run a sample config for model training
-- [ ] I have run the getting started example (`bash run_getting_started_example.sh`)
 - [ ] I have checked that all tests run through (`python tests/tests.py`)
 - [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`)
\ No newline at end of file
diff --git a/tests/run_distributed_tests.sh b/tests/run_distributed_tests.sh
index ed1c49fc..71619df3 100644
--- a/tests/run_distributed_tests.sh
+++ b/tests/run_distributed_tests.sh
@@ -23,7 +23,7 @@ COVERAGE=--no-cov
 
 #############
 ### TESTS ###
-#################
+#############
 # test_fsdp_to_disc_checkpointing
 CUDA_VISIBLE_DEVICES=$DEV0,$DEV1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 $(which pytest) checkpointing/test_fsdp_to_disc_checkpointing.py $COVERAGE
 
diff --git a/tests/tests.py b/tests/tests.py
index ea565fb6..1412c8c5 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -51,6 +51,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d
 
     # run multi-gpu tests
     if multi_gpu:
+        # distributed tests
         run_distributed_tests_directory = _ROOT_DIR / "tests"
         run_distributed_tests_script = _ROOT_DIR / "tests" / "run_distributed_tests.sh"
         assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist."
@@ -62,6 +63,17 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d
         print(command_end_to_end_tests)
         subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True)
 
+        # getting started example
+        run_distributed_tests_directory = _ROOT_DIR / "examples" / "getting_started"
+        run_distributed_tests_script = _ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh"
+        assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist."
+        command_getting_started_example = (
+            f"cd {run_distributed_tests_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}"
+        )
+        print("\n=== RUN GETTING STARTED EXAMPLE ===")
+        print(command_getting_started_example)
+        subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Description of your program")

From 8c783bbf1c9fda64ed707ae8ac5f6f1052227489 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Tue, 6 Aug 2024 17:29:07 +0200
Subject: [PATCH 5/9] test: include getting started example in tests (cleanup)

---
 tests/tests.py | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index 1412c8c5..77e99cbd 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,11 +1,28 @@
 import argparse
+import os
 import subprocess
-from os.path import isfile
+from os.path import isdir, isfile, join
 from pathlib import Path
 
 _ROOT_DIR = Path(__file__).parents[1]
 
 
+def clear_getting_started_example_output_directory(output_directory):
+    assert isdir(output_directory), f"ERROR! {output_directory} does not exist."
+    output_files = [
+        "redpajama_v2_samples_512_train.idx",
+        "redpajama_v2_samples_512_test.idx",
+        "redpajama_v2_samples_512_train.pbin",
+        "redpajama_v2_samples_512_test.pbin",
+    ]
+    print()
+    for output_file in output_files:
+        output_file_path = join(output_directory, output_file)
+        assert isfile(output_file_path), f"ERROR! {output_file_path} does not exist."
+        os.remove(output_file_path)
+        print(f"> removed {output_file_path}")
+
+
 def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"):
     """
     Run tests on cpu, single gpu and multiple gpus
@@ -41,39 +58,46 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d
 
     # run cpu / single-gpu tests
     if cpu or single_gpu:
+        print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===")
         command_unit_tests = (
             f"cd {_ROOT_DIR} && CUDA_VISIBLE_DEVICES={devices[0] if single_gpu else None} python -m pytest"
         )
-
-        print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===")
         print(command_unit_tests)
         subprocess.run(command_unit_tests, shell=True, capture_output=False, text=True)
 
     # run multi-gpu tests
     if multi_gpu:
         # distributed tests
+        print("\n=== RUN MULTI-GPU TESTS ===")
         run_distributed_tests_directory = _ROOT_DIR / "tests"
         run_distributed_tests_script = _ROOT_DIR / "tests" / "run_distributed_tests.sh"
         assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist."
         command_end_to_end_tests = (
             f"cd {run_distributed_tests_directory}; bash run_distributed_tests.sh {devices[0]} {devices[1]}"
         )
-
-        print("\n=== RUN MULTI-GPU TESTS ===")
         print(command_end_to_end_tests)
         subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True)
 
         # getting started example
-        run_distributed_tests_directory = _ROOT_DIR / "examples" / "getting_started"
-        run_distributed_tests_script = _ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh"
-        assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist."
+        print("\n=== RUN GETTING STARTED EXAMPLE ===")
+        run_getting_started_example_directory = _ROOT_DIR / "examples" / "getting_started"
+        run_getting_started_example_script = (
+            _ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh"
+        )
+        assert isfile(
+            run_getting_started_example_script
+        ), f"ERROR! {run_getting_started_example_script} does not exist."
         command_getting_started_example = (
-            f"cd {run_distributed_tests_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}"
+            f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}"
         )
-        print("\n=== RUN GETTING STARTED EXAMPLE ===")
         print(command_getting_started_example)
         subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True)
 
+        output_directory = join(run_getting_started_example_directory, "data", "mem_map")
+        clear_getting_started_example_output_directory(output_directory)
+
+    print("\n=== DONE ===")
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Description of your program")

From 61bde9e7503193b972213453c934f8fb054d0042 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Wed, 14 Aug 2024 07:55:27 +0200
Subject: [PATCH 6/9] chore: typo in getting started example shell script
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Max Lübbering <2804731+le1nux@users.noreply.github.com>
---
 examples/getting_started/run_getting_started_example.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/getting_started/run_getting_started_example.sh b/examples/getting_started/run_getting_started_example.sh
index 8cafe0ab..888f00b9 100644
--- a/examples/getting_started/run_getting_started_example.sh
+++ b/examples/getting_started/run_getting_started_example.sh
@@ -22,7 +22,7 @@ if [[ $first_character =~ [^0-7] ]]   # if the first character is not an integer
 fi
 
 last_character=${1:0-1}
-if [[ $last_character =~ [^0-7] ]]   # if the first character is not an integer 0-7
+if [[ $last_character =~ [^0-7] ]]   # if the last character is not an integer 0-7
     then
         echo "Last character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
         exit

From efc1629c0c4ef29728dc9cec59e08763274ddf78 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Wed, 14 Aug 2024 09:22:04 +0200
Subject: [PATCH 7/9] refactor: restrict getting started example bash script to
 2 gpus

---
 .../run_getting_started_example.sh            | 26 +++++++------------
 tests/tests.py                                |  2 +-
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/examples/getting_started/run_getting_started_example.sh b/examples/getting_started/run_getting_started_example.sh
index 888f00b9..30555256 100644
--- a/examples/getting_started/run_getting_started_example.sh
+++ b/examples/getting_started/run_getting_started_example.sh
@@ -1,32 +1,26 @@
 #!/bin/sh
+set -e  # abort the example as soon as any step fails
 
-# ---------------------------------------------------
-# bash run_getting_started_example.sh 0,1,2,3,4,5,6,7
-# ---------------------------------------------------
+# ---------------------------------------------
+# bash run_getting_started_example.sh 0 1
+# (can only be run on 2 GPUs using this script)
+# ---------------------------------------------
 
 #######################
 ### INPUT ARGUMENTS ###
 #######################
-if [ -z "$1" ]  # if input argument does not exist
+if [ -z "$1" ] || [ -z "$2" ]  # if one of the two input arguments does not exist
   then
-    echo "Need to specify the GPU devices as arguments, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
+    echo "Need to specify 2 GPU devices as arguments, e.g. bash run_getting_started_example.sh 0 1"
     exit
 fi
-CUDA_VISIBLE_DEVICES=$1
-
-first_character=${1:0:1}
-if [[ $first_character =~ [^0-7] ]]   # if the first character is not an integer 0-7
+if [[ ! $1 =~ ^[0-7]$ ]] || [[ ! $2 =~ ^[0-7]$ ]]  # if one of the two input arguments is not a single digit 0-7
     then
-        echo "First character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
+        echo "Need to specify integers 0-7 as arguments, e.g. bash run_getting_started_example.sh 0 1"
         exit
 fi
 
-last_character=${1:0-1}
-if [[ $last_character =~ [^0-7] ]]   # if the last character is not an integer 0-7
-    then
-        echo "Last character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7"
-        exit
-fi
+CUDA_VISIBLE_DEVICES="$1,$2"
 
 #############
 ### RUN #####
diff --git a/tests/tests.py b/tests/tests.py
index 77e99cbd..cb70445f 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -88,7 +88,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d
             run_getting_started_example_script
         ), f"ERROR! {run_getting_started_example_script} does not exist."
         command_getting_started_example = (
-            f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}"
+            f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}"
         )
         print(command_getting_started_example)
         subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True)

From ecb277c863657b910d828553a46b4533745b7eba Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Wed, 14 Aug 2024 10:20:46 +0200
Subject: [PATCH 8/9] test: check and clear checkpoint in getting started
 example

---
 tests/tests.py | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index cb70445f..ee363891 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -1,26 +1,52 @@
 import argparse
 import os
+import shutil
 import subprocess
+from datetime import datetime
 from os.path import isdir, isfile, join
 from pathlib import Path
 
 _ROOT_DIR = Path(__file__).parents[1]
 
 
-def clear_getting_started_example_output_directory(output_directory):
-    assert isdir(output_directory), f"ERROR! {output_directory} does not exist."
-    output_files = [
+def check_existence_and_clear_getting_started_example_output(
+    run_getting_started_example_directory: str, date_of_run: str
+):
+    # data
+    output_directory_data = join(run_getting_started_example_directory, "data", "mem_map")
+    output_files_data = [
         "redpajama_v2_samples_512_train.idx",
         "redpajama_v2_samples_512_test.idx",
         "redpajama_v2_samples_512_train.pbin",
         "redpajama_v2_samples_512_test.pbin",
     ]
     print()
-    for output_file in output_files:
-        output_file_path = join(output_directory, output_file)
+    for output_file_data in output_files_data:
+        output_file_path = join(output_directory_data, output_file_data)
         assert isfile(output_file_path), f"ERROR! {output_file_path} does not exist."
-        os.remove(output_file_path)
-        print(f"> removed {output_file_path}")
+        try:
+            os.remove(output_file_path)
+            print(f"> removed {output_file_path}")
+        except OSError as e:
+            print("Error: %s - %s." % (e.filename, e.strerror))
+
+    # checkpoint
+    output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints")
+    checkpoints = [elem for elem in os.listdir(output_directory_checkpoints) if elem.startswith("20")]
+    checkpoint_to_delete = None
+    for checkpoint in checkpoints:
+        # e.g. "2024-08-14__09-54-53_abcde" -> "2024-08-14__09-54-53"
+        date_of_checkpoint = "_".join(checkpoint.split("_")[:-1])
+        if date_of_checkpoint > date_of_run:
+            checkpoint_to_delete = join(output_directory_checkpoints, checkpoint)
+            break
+    assert checkpoint_to_delete is not None, f"ERROR! could not find a checkpoint with datetime > {date_of_run}"
+    assert isdir(checkpoint_to_delete), f"ERROR! {checkpoint_to_delete} does not exist"
+    try:
+        shutil.rmtree(checkpoint_to_delete)
+        print(f"> removed {checkpoint_to_delete}")
+    except OSError as e:
+        print("Error: %s - %s." % (e.filename, e.strerror))
 
 
 def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"):
@@ -91,10 +117,10 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d
             f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}"
         )
         print(command_getting_started_example)
+        date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S")
         subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True)
 
-        output_directory = join(run_getting_started_example_directory, "data", "mem_map")
-        clear_getting_started_example_output_directory(output_directory)
+        check_existence_and_clear_getting_started_example_output(run_getting_started_example_directory, date_of_run)
 
     print("\n=== DONE ===")
 

From 33c88c002abf1cab15c5e510addd27a6868b45e9 Mon Sep 17 00:00:00 2001
From: Felix Stollenwerk <felix.stollenwerk@ai.se>
Date: Thu, 15 Aug 2024 15:46:58 +0200
Subject: [PATCH 9/9] test: use f-strings in getting started example test

---
 tests/tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tests.py b/tests/tests.py
index ee363891..ce8078c5 100644
--- a/tests/tests.py
+++ b/tests/tests.py
@@ -28,7 +28,7 @@ def check_existence_and_clear_getting_started_example_output(
             os.remove(output_file_path)
             print(f"> removed {output_file_path}")
         except OSError as e:
-            print("Error: %s - %s." % (e.filename, e.strerror))
+            print(f"Error: {e.filename} - {e.strerror}.")
 
     # checkpoint
     output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints")
@@ -46,7 +46,7 @@ def check_existence_and_clear_getting_started_example_output(
         shutil.rmtree(checkpoint_to_delete)
         print(f"> removed {checkpoint_to_delete}")
     except OSError as e:
-        print("Error: %s - %s." % (e.filename, e.strerror))
+        print(f"Error: {e.filename} - {e.strerror}.")
 
 
 def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"):