Skip to content

Commit

Permalink
Merge pull request #211 from Modalities/fix/getting_started_example
Browse files Browse the repository at this point in the history
Fix Getting Started Example
  • Loading branch information
le1nux authored Aug 15, 2024
2 parents db18bfb + 33c88c0 commit 88655fc
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 10 deletions.
3 changes: 2 additions & 1 deletion .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ This PR ..
- [ ] I have merged the latest version of the target branch into this feature branch
- [ ] I have reviewed my own code w.r.t. correct implementation, missing type hints, proper documentation, etc.
- [ ] I have run a sample config for model training
- [ ] I have checked that all tests run through (`python tests/tests.py`)
- [ ] I have checked that all tests run through (`python tests/tests.py`)
- [ ] I have updated the internal changelog (`CHANGELOG_DEV.md`)
5 changes: 2 additions & 3 deletions examples/getting_started/example_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,7 @@ batch_progress_subscriber:
component_key: progress_subscriber
variant_key: rich
config:
local_rank: ${settings.cuda_env.local_rank}
world_size: ${settings.cuda_env.world_size}
global_rank: ${settings.cuda_env.global_rank}
global_num_seen_steps:
component_key: number_conversion
variant_key: num_steps_from_num_tokens
Expand All @@ -277,7 +276,7 @@ evaluation_subscriber:
component_key: results_subscriber
variant_key: wandb
config:
local_rank: ${settings.cuda_env.local_rank}
global_rank: ${settings.cuda_env.global_rank}
project: modalities_getting_started
mode: OFFLINE
experiment_id: ${settings.experiment_id}
Expand Down
34 changes: 34 additions & 0 deletions examples/getting_started/run_getting_started_example.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# NOTE: bash (not sh) is required because the argument validation below uses
# the bash-only [[ ... =~ ... ]] regex test; under dash/sh the script would fail.
set -e

# ---------------------------------------------
# bash run_getting_started_example.sh 0 1
# (can only be run on 2 GPUs using this script)
# ---------------------------------------------

#######################
### INPUT ARGUMENTS ###
#######################
if [ -z "$1" ] || [ -z "$2" ] # if one of the two input arguments does not exist
then
  echo "Need to specify 2 GPU devices as arguments, e.g. bash run_getting_started_example.sh 0 1"
  exit 1  # non-zero so callers (e.g. tests.py) can detect the failure
fi
if [[ $1 =~ [^0-7] ]] || [[ $2 =~ [^0-7] ]] # if one of the two input arguments is not an integer 0-7
then
  echo "Need to specify integers 0-7 as arguments, e.g. bash run_getting_started_example.sh 0 1"
  exit 1  # non-zero so callers (e.g. tests.py) can detect the failure
fi

CUDA_VISIBLE_DEVICES="$1,$2"

#############
### RUN #####
#############
echo "> run getting_started_examples on CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

# build raw-text indices, pack the train/test datasets, then launch a 2-GPU training run
modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_train.idx data/raw/redpajama_v2_samples_512_train.jsonl
modalities data create_raw_index --index_path data/mem_map/redpajama_v2_samples_512_test.idx data/raw/redpajama_v2_samples_512_test.jsonl
modalities data pack_encoded_data example_dataset_config_train.yaml
modalities data pack_encoded_data example_dataset_config_test.yaml
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES torchrun --rdzv-endpoint localhost:29505 --nnodes 1 --nproc_per_node 2 $(which modalities) run --config_file_path example_config.yaml
2 changes: 1 addition & 1 deletion tests/run_distributed_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ COVERAGE=--no-cov

#############
### TESTS ###
#################
#############
# test_fsdp_to_disc_checkpointing
CUDA_VISIBLE_DEVICES=$DEV0,$DEV1 torchrun --rdzv-endpoint localhost:29502 --nnodes 1 --nproc_per_node 2 $(which pytest) checkpointing/test_fsdp_to_disc_checkpointing.py $COVERAGE

Expand Down
72 changes: 67 additions & 5 deletions tests/tests.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,54 @@
import argparse
import os
import shutil
import subprocess
from os.path import isfile
from datetime import datetime
from os.path import isdir, isfile, join
from pathlib import Path

_ROOT_DIR = Path(__file__).parents[1]


def check_existence_and_clear_getting_started_example_output(
    run_getting_started_example_directory: str, date_of_run: str
):
    """Verify and clean up the artifacts produced by the getting-started example run.

    Asserts that the four index/packed-data files and one checkpoint directory
    created at or after ``date_of_run`` exist, then deletes them so that repeated
    test runs start from a clean state.

    Args:
        run_getting_started_example_directory: Root of the getting-started example
            (must contain ``data/mem_map`` and ``checkpoints`` subdirectories).
        date_of_run: Run start time formatted as ``%Y-%m-%d__%H-%M-%S``; with this
            zero-padded format, lexicographic string comparison is chronologically
            correct.

    Raises:
        AssertionError: If an expected data file or a matching checkpoint is missing.
    """
    # data: the four files created by the index/pack steps of the example script
    output_directory_data = join(run_getting_started_example_directory, "data", "mem_map")
    output_files_data = [
        "redpajama_v2_samples_512_train.idx",
        "redpajama_v2_samples_512_test.idx",
        "redpajama_v2_samples_512_train.pbin",
        "redpajama_v2_samples_512_test.pbin",
    ]
    print()
    for output_file_data in output_files_data:
        output_file_path = join(output_directory_data, output_file_data)
        assert isfile(output_file_path), f"ERROR! {output_file_path} does not exist."
        try:
            os.remove(output_file_path)
            print(f"> removed {output_file_path}")
        except OSError as e:
            # best-effort cleanup: report but do not abort the remaining deletions
            print(f"Error: {e.filename} - {e.strerror}.")

    # checkpoint: directories are named "<timestamp>_<suffix>",
    # e.g. "2024-08-14__09-54-53_abcde". Sort so that with several candidates we
    # deterministically pick the earliest one created at or after the run start
    # (os.listdir order is arbitrary).
    output_directory_checkpoints = join(run_getting_started_example_directory, "checkpoints")
    checkpoints = sorted(elem for elem in os.listdir(output_directory_checkpoints) if elem.startswith("20"))
    checkpoint_to_delete = None
    for checkpoint in checkpoints:
        # strip the trailing "_<suffix>": "2024-08-14__09-54-53_abcde" -> "2024-08-14__09-54-53"
        date_of_checkpoint = "_".join(checkpoint.split("_")[:-1])
        # ">=" (not ">"): a checkpoint written within the same second as the
        # recorded run start must still be matched.
        if date_of_checkpoint >= date_of_run:
            checkpoint_to_delete = join(output_directory_checkpoints, checkpoint)
            break
    assert checkpoint_to_delete is not None, f"ERROR! could not find a checkpoint with datetime >= {date_of_run}"
    assert isdir(checkpoint_to_delete), f"ERROR! {checkpoint_to_delete} does not exist"
    try:
        shutil.rmtree(checkpoint_to_delete)
        print(f"> removed {checkpoint_to_delete}")
    except OSError as e:
        # best-effort cleanup: report but do not abort
        print(f"Error: {e.filename} - {e.strerror}.")


def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, devices: str = "0,1"):
"""
Run tests on cpu, single gpu and multiple gpus
Expand Down Expand Up @@ -41,27 +84,46 @@ def main(cpu: bool = False, single_gpu: bool = False, multi_gpu: bool = False, d

# run cpu / single-gpu tests
if cpu or single_gpu:
print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===")
command_unit_tests = (
f"cd {_ROOT_DIR} && CUDA_VISIBLE_DEVICES={devices[0] if single_gpu else None} python -m pytest"
)

print("\n=== RUN CPU & SINGLE-GPU TESTS ===" if single_gpu else "\n=== RUN CPU TESTS ===")
print(command_unit_tests)
subprocess.run(command_unit_tests, shell=True, capture_output=False, text=True)

# run multi-gpu tests
if multi_gpu:
# distributed tests
print("\n=== RUN MULTI-GPU TESTS ===")
run_distributed_tests_directory = _ROOT_DIR / "tests"
run_distributed_tests_script = _ROOT_DIR / "tests" / "run_distributed_tests.sh"
assert isfile(run_distributed_tests_script), f"ERROR! {run_distributed_tests_script} does not exist."
command_end_to_end_tests = (
f"cd {run_distributed_tests_directory}; bash run_distributed_tests.sh {devices[0]} {devices[1]}"
)

print("\n=== RUN MULTI-GPU TESTS ===")
print(command_end_to_end_tests)
subprocess.run(command_end_to_end_tests, shell=True, capture_output=False, text=True)

# getting started example
print("\n=== RUN GETTING STARTED EXAMPLE ===")
run_getting_started_example_directory = _ROOT_DIR / "examples" / "getting_started"
run_getting_started_example_script = (
_ROOT_DIR / "examples" / "getting_started" / "run_getting_started_example.sh"
)
assert isfile(
run_getting_started_example_script
), f"ERROR! {run_getting_started_example_script} does not exist."
command_getting_started_example = (
f"cd {run_getting_started_example_directory}; bash run_getting_started_example.sh {devices[0]} {devices[1]}"
)
print(command_getting_started_example)
date_of_run = datetime.now().strftime("%Y-%m-%d__%H-%M-%S")
subprocess.run(command_getting_started_example, shell=True, capture_output=False, text=True)

check_existence_and_clear_getting_started_example_output(run_getting_started_example_directory, date_of_run)

print("\n=== DONE ===")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Description of your program")
Expand Down

0 comments on commit 88655fc

Please sign in to comment.