Skip to content

Commit

Permalink
Merge pull request #211 from Modalities/fix/getting_started_example
Browse files Browse the repository at this point in the history
Fix Getting Started Example
  • Loading branch information
le1nux authored Aug 15, 2024
2 parents db18bfb + 33c88c0 commit 88655fc
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 10 deletions.
3 changes: 2 additions & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ This PR ..
- [ ] I have merged the latest version of the target branch into this feature branch
- [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc.
- [ ] I have run a sample config for model training
- [ ] I have checked that all tests run through (`python tests/tests.py`)
- [ ] I have checked that all tests run through (`python tests/tests.py`)
- [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`)
5 changes: 2 additions & 3 deletions examples/getting_started/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
local_rank: ${settings.cuda_env.local_rank}
world_size: ${settings.cuda_env.world_size}
global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
Expand All @@ -277,7 +276,7 @@ evaluation_subscriber:
component_key: results_subscriber
variant_key: wandb
config:
local_rank: ${settings.cuda_env.local_rank}
global_rank: ${settings.cuda_env.global_rank}
project: modalities_getting_started
mode: OFFLINE
experiment_id: ${settings.experiment_id}
Expand Down
34 changes: 34 additions & 0 deletions examples/getting_started/run_getting_started_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# NOTE: bash (not sh) is required because the argument validation below uses
# the bash-only [[ ... =~ ... ]] regex test; under dash/sh the script would fail.
set -e

# ---------------------------------------------
# bash run_getting_started_example.sh 0 1
# (can only be run on 2 GPUs using this script)
# ---------------------------------------------

#######################
### INPUT ARGUMENTS ###
#######################
if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist
then
  echo "Need to specify 2 GPU devices as arguments, e.g. bash run_getting_started_example.sh 0 1"
  exit 1  # non-zero so callers (e.g. tests.py) can detect the failure
fi
if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments is not an integer 0-7
then
  echo "Need to specify integers 0-7 as arguments, e.g. bash run_getting_started_example.sh 0 1"
  exit 1  # non-zero so callers (e.g. tests.py) can detect the failure
fi

CUDA_VISIBLE_DEVICES="$1,$2"

#############
### RUN #####
#############
echo "> run getting_started_examples on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

# build raw-text indices, pack the train/test datasets, then launch a 2-GPU training run
modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl
modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl
modalities data pack_encoded_data example_dataset_config_train.yaml
modalities data pack_encoded_data example_dataset_config_test.yaml
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path example_config.yaml
2 changes: 1 addition & 1 deletion tests/run_distributed_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ COVERAGE=--no-cov

#############
### TESTS ###
#################
#############
# test_fsdp_to_disc_checkpointing
CUDA_VISIBLE_DEVICES=$DEV0,$DEV1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 $(which pytest) checkpointing/test_fsdp_to_disc_checkpointing.py $COVERAGE

Expand Down
72 changes: 67 additions & 5 deletions tests/tests.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,54 @@
import argparse
import os
import shutil
import subprocess
from os.path import isfile
from datetime import datetime
from os.path import isdir, isfile, join
from pathlib import Path

_ROOT_DIR = Path(__file__).parents[1]


def check_existence_and_clear_getting_started_example_output(
    run_getting_started_example_directory: str, date_of_run: str
):
    """Verify and clean up the artifacts produced by the getting-started example run.

    Asserts that the four index/packed-data files and one checkpoint directory
    created at or after ``date_of_run`` exist, then deletes them so that repeated
    test runs start from a clean state.

    Args:
        run_getting_started_example_directory: Root of the getting-started example
            (must contain ``data/mem_map`` and ``checkpoints`` subdirectories).
        date_of_run: Run start time formatted as ``%Y-%m-%d__%H-%M-%S``; with this
            zero-padded format, lexicographic string comparison is chronologically
            correct.

    Raises:
        AssertionError: If an expected data file or a matching checkpoint is missing.
    """
    # data: the four files created by the index/pack steps of the example script
    output_directory_data = join(run_getting_started_example_directory, "data", "mem_map")
    output_files_data = [
        "redpajama_v2_samples_512_train.idx",
        "redpajama_v2_samples_512_test.idx",
        "redpajama_v2_samples_512_train.pbin",
        "redpajama_v2_samples_512_test.pbin",
    ]
    print()
    for output_file_data in output_files_data:
        output_file_path = join(output_directory_data, output_file_data)
        assert isfile(output_file_path), f"ERROR! {output_file_path} does not exist."
        try:
            os.remove(output_file_path)
            print(f"> removed {output_file_path}")
        except OSError as e:
            # best-effort cleanup: report but do not abort the remaining deletions
            print(f"Error: {e.filename} - {e.strerror}.")

    # checkpoint: directories are named "<timestamp>_<suffix>",
    # e.g. "2024-08-14__09-54-53_abcde". Sort so that with several candidates we
    # deterministically pick the earliest one created at or after the run start
    # (os.listdir order is arbitrary).
    output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints")
    checkpoints = sorted(elem for elem in os.listdir(output_directory_checkpoints) if elem.startswith("20"))
    checkpoint_to_delete = None
    for checkpoint in checkpoints:
        # strip the trailing "_<suffix>": "2024-08-14__09-54-53_abcde" -> "2024-08-14__09-54-53"
        date_of_checkpoint = "_".join(checkpoint.split("_")[:-1])
        # ">=" (not ">"): a checkpoint written within the same second as the
        # recorded run start must still be matched.
        if date_of_checkpoint >= date_of_run:
            checkpoint_to_delete = join(output_directory_checkpoints, checkpoint)
            break
    assert checkpoint_to_delete is not None, f"ERROR! could not find a checkpoint with datetime >= {date_of_run}"
    assert isdir(checkpoint_to_delete), f"ERROR! {checkpoint_to_delete} does not exist"
    try:
        shutil.rmtree(checkpoint_to_delete)
        print(f"> removed {checkpoint_to_delete}")
    except OSError as e:
        # best-effort cleanup: report but do not abort
        print(f"Error: {e.filename} - {e.strerror}.")


def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"):
"""
Run tests on cpu, single gpu and multiple gpus
Expand Down Expand Up @@ -41,27 +84,46 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d

# run cpu / single-gpu tests
if cpu or single_gpu:
print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===")
command_unit_tests = (
f"cd {_ROOT_DIR} && CUDA_VISIBLE_DEVICES={devices[0] if single_gpu else None} python -m pytest"
)

print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===")
print(command_unit_tests)
subprocess.run(command_unit_tests, shell=True, capture_output=False, text=True)

# run multi-gpu tests
if multi_gpu:
# distributed tests
print("\n=== RUN MULTI-GPU TESTS ===")
run_distributed_tests_directory = _ROOT_DIR / "tests"
run_distributed_tests_script = _ROOT_DIR / "tests" / "run_distributed_tests.sh"
assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist."
command_end_to_end_tests = (
f"cd {run_distributed_tests_directory}; bash run_distributed_tests.sh {devices[0]} {devices[1]}"
)

print("\n=== RUN MULTI-GPU TESTS ===")
print(command_end_to_end_tests)
subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True)

# getting started example
print("\n=== RUN GETTING STARTED EXAMPLE ===")
run_getting_started_example_directory = _ROOT_DIR / "examples" / "getting_started"
run_getting_started_example_script = (
_ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh"
)
assert isfile(
run_getting_started_example_script
), f"ERROR! {run_getting_started_example_script} does not exist."
command_getting_started_example = (
f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}"
)
print(command_getting_started_example)
date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S")
subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True)

check_existence_and_clear_getting_started_example_output(run_getting_started_example_directory, date_of_run)

print("\n=== DONE ===")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Description of your program")
Expand Down

0 comments on commit 88655fc

Please sign in to comment.