From a80222d3ae3289ba0f5fcf9c0ed309b1a761846f Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 6 Aug 2024 11:53:14 +0200 Subject: [PATCH 1/9] fix: update getting started example config --- examples/getting_started/example_config.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/getting_started/example_config.yaml b/examples/getting_started/example_config.yaml index 0ba264c3..14f85dba 100644 --- a/examples/getting_started/example_config.yaml +++ b/examples/getting_started/example_config.yaml @@ -254,8 +254,7 @@ batch_progress_subscriber: component_key: progress_subscriber variant_key: rich config: - local_rank: ${settings.cuda_env.local_rank} - world_size: ${settings.cuda_env.world_size} + global_rank: ${settings.cuda_env.global_rank} global_num_seen_steps: component_key: number_conversion variant_key: num_steps_from_num_tokens @@ -277,7 +276,7 @@ evaluation_subscriber: component_key: results_subscriber variant_key: wandb config: - local_rank: ${settings.cuda_env.local_rank} + global_rank: ${settings.cuda_env.global_rank} project: modalities_getting_started mode: OFFLINE experiment_id: ${settings.experiment_id} From 11dfbfae7fde62a4eb09cd86afb2812f6c5e1619 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 6 Aug 2024 11:58:28 +0200 Subject: [PATCH 2/9] feat: add shell script to run complete getting started example --- .../run_getting_started_example.sh | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 examples/getting_started/run_getting_started_example.sh diff --git a/examples/getting_started/run_getting_started_example.sh b/examples/getting_started/run_getting_started_example.sh new file mode 100644 index 00000000..8cafe0ab --- /dev/null +++ b/examples/getting_started/run_getting_started_example.sh @@ -0,0 +1,40 @@ +#!/bin/sh + +# --------------------------------------------------- +# bash run_getting_started_example.sh 0,1,2,3,4,5,6,7 +# --------------------------------------------------- + +####################### +### INPUT ARGUMENTS ### +####################### +if [ -z "$1" ] # if input argument does not exist + then + echo "Need to specify the GPU devices as arguments, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" + exit +fi +CUDA_VISIBLE_DEVICES=$1 + +first_character=${1:0:1} +if [[ $first_character =~ [^0-7] ]] # if the first character is not an integer 0-7 + then + echo "First character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" + exit +fi + +last_character=${1:0-1} +if [[ $last_character =~ [^0-7] ]] # if the first character is not an integer 0-7 + then + echo "Last character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" + exit +fi + +############# +### RUN ##### +############# +echo "> run getting_started_examples on CUDA_VISIBLE_DEVICES="$CUDA_VISIBLE_DEVICES + +modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl +modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl +modalities data pack_encoded_data example_dataset_config_train.yaml +modalities data pack_encoded_data example_dataset_config_test.yaml +CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path example_config.yaml From 6974d0c3c3b09693588aa78a2373a0ddb758479e Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 6 Aug 2024 12:06:12 +0200 Subject: [PATCH 3/9] chore: add checks to PR template --- .github/pull_request_template.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4c04b72f..de2df236 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -13,4 +13,6 @@ This PR .. - [ ] I have merged the latest version of the target branch into this feature branch - [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc. - [ ] I have run a sample config for model training -- [ ] I have checked that all tests run through (`python tests/tests.py`) \ No newline at end of file +- [ ] I have run the getting started example (`bash run_getting_started_example.sh`) +- [ ] I have checked that all tests run through (`python tests/tests.py`) +- [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`) \ No newline at end of file From 54ea4c1cccf5b7734efa9bc2e71158d4f7ec2a73 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 6 Aug 2024 14:02:16 +0200 Subject: [PATCH 4/9] test: include getting started example in tests --- .github/pull_request_template.md | 1 - tests/run_distributed_tests.sh | 2 +- tests/tests.py | 12 ++++++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index de2df236..7c211215 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -13,6 +13,5 @@ This PR .. - [ ] I have merged the latest version of the target branch into this feature branch - [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc. - [ ] I have run a sample config for model training -- [ ] I have run the getting started example (`bash run_getting_started_example.sh`) - [ ] I have checked that all tests run through (`python tests/tests.py`) - [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`) \ No newline at end of file diff --git a/tests/run_distributed_tests.sh b/tests/run_distributed_tests.sh index ed1c49fc..71619df3 100644 --- a/tests/run_distributed_tests.sh +++ b/tests/run_distributed_tests.sh @@ -23,7 +23,7 @@ COVERAGE=--no-cov ############# ### TESTS ### -################# +############# # test_fsdp_to_disc_checkpointing CUDA_VISIBLE_DEVICES=$DEV0,$DEV1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 $(which pytest) checkpointing/test_fsdp_to_disc_checkpointing.py $COVERAGE diff --git a/tests/tests.py b/tests/tests.py index ea565fb6..1412c8c5 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -51,6 +51,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d # run multi-gpu tests if multi_gpu: + # distributed tests run_distributed_tests_directory = _ROOT_DIR / "tests" run_distributed_tests_script = _ROOT_DIR / "tests" / "run_distributed_tests.sh" assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist." @@ -62,6 +63,17 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d print(command_end_to_end_tests) subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True) + # getting started example + run_distributed_tests_directory = _ROOT_DIR / "examples" / "getting_started" + run_distributed_tests_script = _ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh" + assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist." + command_getting_started_example = ( + f"cd {run_distributed_tests_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}" + ) + print("\n=== RUN GETTING STARTED EXAMPLE ===") + print(command_getting_started_example) + subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Description of your program") From 8c783bbf1c9fda64ed707ae8ac5f6f1052227489 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 6 Aug 2024 17:29:07 +0200 Subject: [PATCH 5/9] test: include getting started example in tests (cleanup) --- tests/tests.py | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index 1412c8c5..77e99cbd 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,11 +1,28 @@ import argparse +import os import subprocess -from os.path import isfile +from os.path import isdir, isfile, join from pathlib import Path _ROOT_DIR = Path(__file__).parents[1] +def clear_getting_started_example_output_directory(output_directory): + assert isdir(output_directory), f"ERROR! {output_directory} does not exist." + output_files = [ + "redpajama_v2_samples_512_train.idx", + "redpajama_v2_samples_512_test.idx", + "redpajama_v2_samples_512_train.pbin", + "redpajama_v2_samples_512_test.pbin", + ] + print() + for output_file in output_files: + output_file_path = join(output_directory, output_file) + assert isfile(output_file_path), f"ERROR! {output_file_path} does not exist." + os.remove(output_file_path) + print(f"> removed {output_file_path}") + + def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"): """ Run tests on cpu, single gpu and multiple gpus @@ -41,39 +58,46 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d # run cpu / single-gpu tests if cpu or single_gpu: + print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===") command_unit_tests = ( f"cd {_ROOT_DIR} && CUDA_VISIBLE_DEVICES={devices[0] if single_gpu else None} python -m pytest" ) - - print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===") print(command_unit_tests) subprocess.run(command_unit_tests, shell=True, capture_output=False, text=True) # run multi-gpu tests if multi_gpu: # distributed tests + print("\n=== RUN MULTI-GPU TESTS ===") run_distributed_tests_directory = _ROOT_DIR / "tests" run_distributed_tests_script = _ROOT_DIR / "tests" / "run_distributed_tests.sh" assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist." command_end_to_end_tests = ( f"cd {run_distributed_tests_directory}; bash run_distributed_tests.sh {devices[0]} {devices[1]}" ) - - print("\n=== RUN MULTI-GPU TESTS ===") print(command_end_to_end_tests) subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True) # getting started example - run_distributed_tests_directory = _ROOT_DIR / "examples" / "getting_started" - run_distributed_tests_script = _ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh" - assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist." + print("\n=== RUN GETTING STARTED EXAMPLE ===") + run_getting_started_example_directory = _ROOT_DIR / "examples" / "getting_started" + run_getting_started_example_script = ( + _ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh" + ) + assert isfile( + run_getting_started_example_script + ), f"ERROR! {run_getting_started_example_script} does not exist." command_getting_started_example = ( - f"cd {run_distributed_tests_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}" + f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}" ) - print("\n=== RUN GETTING STARTED EXAMPLE ===") print(command_getting_started_example) subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) + output_directory = join(run_getting_started_example_directory, "data", "mem_map") + clear_getting_started_example_output_directory(output_directory) + + print("\n=== DONE ===") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Description of your program") From 61bde9e7503193b972213453c934f8fb054d0042 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Wed, 14 Aug 2024 07:55:27 +0200 Subject: [PATCH 6/9] chore: typo in getting started example shell script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Max Lübbering <2804731+le1nux@users.noreply.github.com> --- examples/getting_started/run_getting_started_example.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/getting_started/run_getting_started_example.sh b/examples/getting_started/run_getting_started_example.sh index 8cafe0ab..888f00b9 100644 --- a/examples/getting_started/run_getting_started_example.sh +++ b/examples/getting_started/run_getting_started_example.sh @@ -22,7 +22,7 @@ if [[ $first_character =~ [^0-7] ]] # if the first character is not an integer fi last_character=${1:0-1} -if [[ $last_character =~ [^0-7] ]] # if the first character is not an integer 0-7 +if [[ $last_character =~ [^0-7] ]] # if the last character is not an integer 0-7 then echo "Last character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" exit From efc1629c0c4ef29728dc9cec59e08763274ddf78 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Wed, 14 Aug 2024 09:22:04 +0200 Subject: [PATCH 7/9] refactor: restrict getting started example bash script to 2 gpus --- .../run_getting_started_example.sh | 26 +++++++------------ tests/tests.py | 2 +- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/examples/getting_started/run_getting_started_example.sh b/examples/getting_started/run_getting_started_example.sh index 888f00b9..30555256 100644 --- a/examples/getting_started/run_getting_started_example.sh +++ b/examples/getting_started/run_getting_started_example.sh @@ -1,32 +1,26 @@ #!/bin/sh +set -e -# --------------------------------------------------- -# bash run_getting_started_example.sh 0,1,2,3,4,5,6,7 -# --------------------------------------------------- +# --------------------------------------------- +# bash run_getting_started_example.sh 0 1 +# (can only be run on 2 GPUs using this script) +# --------------------------------------------- ####################### ### INPUT ARGUMENTS ### ####################### -if [ -z "$1" ] # if input argument does not exist +if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist then - echo "Need to specify the GPU devices as arguments, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" + echo "Need to specify 2 GPU devices as arguments, e.g. bash run_getting_started_example.sh 0 1" exit fi -CUDA_VISIBLE_DEVICES=$1 - -first_character=${1:0:1} -if [[ $first_character =~ [^0-7] ]] # if the first character is not an integer 0-7 +if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments is not an integer 0-7 then - echo "First character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" + echo "Need to specify integers 0-7 as arguments, e.g. bash run_getting_started_example.sh 0 1" exit fi -last_character=${1:0-1} -if [[ $last_character =~ [^0-7] ]] # if the last character is not an integer 0-7 - then - echo "Last character of specified argument needs to be an integer, e.g. bash run_getting_started_example.sh 0,1,2,3,4,5,6,7" - exit -fi +CUDA_VISIBLE_DEVICES="$1,$2" ############# ### RUN ##### diff --git a/tests/tests.py b/tests/tests.py index 77e99cbd..cb70445f 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -88,7 +88,7 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d run_getting_started_example_script ), f"ERROR! {run_getting_started_example_script} does not exist." command_getting_started_example = ( - f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]},{devices[1]}" + f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}" ) print(command_getting_started_example) subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) From ecb277c863657b910d828553a46b4533745b7eba Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Wed, 14 Aug 2024 10:20:46 +0200 Subject: [PATCH 8/9] test: check and clear checkpoint in getting started example --- tests/tests.py | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index cb70445f..ee363891 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -1,26 +1,52 @@ import argparse import os +import shutil import subprocess +from datetime import datetime from os.path import isdir, isfile, join from pathlib import Path _ROOT_DIR = Path(__file__).parents[1] -def clear_getting_started_example_output_directory(output_directory): - assert isdir(output_directory), f"ERROR! {output_directory} does not exist." - output_files = [ +def check_existence_and_clear_getting_started_example_output( + run_getting_started_example_directory: str, date_of_run: str +): + # data + output_directory_data = join(run_getting_started_example_directory, "data", "mem_map") + output_files_data = [ "redpajama_v2_samples_512_train.idx", "redpajama_v2_samples_512_test.idx", "redpajama_v2_samples_512_train.pbin", "redpajama_v2_samples_512_test.pbin", ] print() - for output_file in output_files: - output_file_path = join(output_directory, output_file) + for output_file_data in output_files_data: + output_file_path = join(output_directory_data, output_file_data) assert isfile(output_file_path), f"ERROR! {output_file_path} does not exist." - os.remove(output_file_path) - print(f"> removed {output_file_path}") + try: + os.remove(output_file_path) + print(f"> removed {output_file_path}") + except OSError as e: + print("Error: %s - %s." % (e.filename, e.strerror)) + + # checkpoint + output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints") + checkpoints = [elem for elem in os.listdir(output_directory_checkpoints) if elem.startswith("20")] + checkpoint_to_delete = None + for checkpoint in checkpoints: + # e.g. "2024-08-14__09-54-53_abcde" -> "2024-08-14__09-54-53" + date_of_checkpoint = "_".join(checkpoint.split("_")[:-1]) + if date_of_checkpoint > date_of_run: + checkpoint_to_delete = join(output_directory_checkpoints, checkpoint) + break + assert checkpoint_to_delete is not None, f"ERROR! could not find a checkpoint with datetime > {date_of_run}" + assert isdir(checkpoint_to_delete), f"ERROR! {checkpoint_to_delete} does not exist" + try: + shutil.rmtree(checkpoint_to_delete) + print(f"> removed {checkpoint_to_delete}") + except OSError as e: + print("Error: %s - %s." % (e.filename, e.strerror)) def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"): @@ -91,10 +117,10 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}" ) print(command_getting_started_example) + date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S") subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True) - output_directory = join(run_getting_started_example_directory, "data", "mem_map") - clear_getting_started_example_output_directory(output_directory) + check_existence_and_clear_getting_started_example_output(run_getting_started_example_directory, date_of_run) print("\n=== DONE ===") From 33c88c002abf1cab15c5e510addd27a6868b45e9 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Thu, 15 Aug 2024 15:46:58 +0200 Subject: [PATCH 9/9] test: use f-strings in getting started example test --- tests/tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tests.py b/tests/tests.py index ee363891..ce8078c5 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -28,7 +28,7 @@ def check_existence_and_clear_getting_started_example_output( os.remove(output_file_path) print(f"> removed {output_file_path}") except OSError as e: - print("Error: %s - %s." % (e.filename, e.strerror)) + print(f"Error: {e.filename} - {e.strerror}.") # checkpoint output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints") @@ -46,7 +46,7 @@ def check_existence_and_clear_getting_started_example_output( shutil.rmtree(checkpoint_to_delete) print(f"> removed {checkpoint_to_delete}") except OSError as e: - print("Error: %s - %s." % (e.filename, e.strerror)) + print(f"Error: {e.filename} - {e.strerror}.") def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"):