diff --git a/ludwig/constants.py b/ludwig/constants.py
index eaad346f7ce..55c6cc0cf57 100644
--- a/ludwig/constants.py
+++ b/ludwig/constants.py
@@ -186,9 +186,12 @@
 BATCH_SIZE = "batch_size"
 EVAL_BATCH_SIZE = "eval_batch_size"
 DEFAULT_BATCH_SIZE = "auto"
-MAX_POSSIBLE_BATCH_SIZE = (
-    1099511627776  # 2^40. Used for `max_batch_size` config param. Not a hard constraint for `batch_size` config param.
-)
+# 2^40. Used for `max_batch_size` config param. Not a hard constraint for `batch_size` config param.
+MAX_POSSIBLE_BATCH_SIZE = 1099511627776
+# min batch size. Used as a floor for batch size tuning. Not a hard constraint for `batch_size` config params.
+MIN_POSSIBLE_BATCH_SIZE = 2
+# max batch size for dataset is 20% of dataset size
+MAX_BATCH_SIZE_DATASET_FRACTION = 0.2
 LEARNING_RATE = "learning_rate"
 INPUT_SIZE = "input_size"
 USE_BIAS = "use_bias"
diff --git a/ludwig/utils/batch_size_tuner.py b/ludwig/utils/batch_size_tuner.py
index cb0a3a8d22a..9977c1b04c1 100644
--- a/ludwig/utils/batch_size_tuner.py
+++ b/ludwig/utils/batch_size_tuner.py
@@ -8,6 +8,7 @@
 import torch
 
 from ludwig.api_annotations import DeveloperAPI
+from ludwig.constants import MAX_BATCH_SIZE_DATASET_FRACTION, MIN_POSSIBLE_BATCH_SIZE
 
 logger = logging.getLogger(__name__)
 
@@ -26,19 +27,21 @@ def select_best_batch_size(
         max_batch_size = max_batch_size or dataset_len
 
         def _is_valid_batch_size(batch_size):
-            # make sure that batch size is valid (e.g. less than size of ds)
-            is_smaller_than_training_set = batch_size < dataset_len
+            # make sure that batch size is valid (e.g. less than 20% of ds size and max_batch_size)
+            is_smaller_than_training_set = batch_size <= MAX_BATCH_SIZE_DATASET_FRACTION * dataset_len
             is_under_max_batch_size = batch_size <= max_batch_size
             is_valid = is_smaller_than_training_set and is_under_max_batch_size
             if not is_valid:
                 logger.info(
-                    f"Batch size {batch_size} is invalid, must be smaller than training set size "
-                    f"{dataset_len} and less than or equal to max batch size {max_batch_size}"
+                    f"Batch size {batch_size} is invalid, must be less than or equal to "
+                    f"{MAX_BATCH_SIZE_DATASET_FRACTION * 100}% dataset size "
+                    f"({int(MAX_BATCH_SIZE_DATASET_FRACTION * dataset_len)} samples "
+                    f"of {dataset_len}) and less than or equal to max batch size {max_batch_size}"
                 )
             return is_valid
 
         # Set 2 as the minimum batch size to account for batch norm.
-        batch_size = 2
+        batch_size = MIN_POSSIBLE_BATCH_SIZE
 
         best_samples_per_sec = 0
         best_batch_size = None
@@ -71,6 +74,12 @@ def _is_valid_batch_size(batch_size):
                     raise
                 break
 
+        # Ensure that some batch size is found.
+        # `best_batch_size` can be None if the first batch size is invalid.
+        if best_batch_size is None:
+            logger.info("Could not tune batch size, using minimum batch size of 2")
+            best_batch_size = MIN_POSSIBLE_BATCH_SIZE
+
         logger.info(f"Selected batch_size={best_batch_size}")
         return best_batch_size
 
diff --git a/tests/integration_tests/test_ray.py b/tests/integration_tests/test_ray.py
index 383e5a6614b..a3068d88d93 100644
--- a/tests/integration_tests/test_ray.py
+++ b/tests/integration_tests/test_ray.py
@@ -37,6 +37,7 @@
     DATE,
     H3,
     IMAGE,
+    MAX_BATCH_SIZE_DATASET_FRACTION,
     NAME,
     NUMBER,
     PREPROCESSING,
@@ -49,6 +50,7 @@
     VECTOR,
 )
 from ludwig.data.preprocessing import balance_data
+from ludwig.data.split import DEFAULT_PROBABILITIES
 from ludwig.utils.data_utils import read_parquet
 from ludwig.utils.misc_utils import merge_dict
 from tests.integration_tests.utils import (
@@ -795,7 +797,7 @@ def _run_train_gpu_load_cpu(config, data_parquet):
 @pytest.mark.distributed
 @pytest.mark.parametrize(
     ("max_batch_size", "expected_final_batch_size", "expected_final_learning_rate"),
-    [(256, 128, 0.001), (32, 32, 0.001)],
+    [(256, None, 0.001), (8, 8, 0.001)],
 )
 def test_tune_batch_size_lr_cpu(
     tmpdir, ray_cluster_2cpu, max_batch_size, expected_final_batch_size, expected_final_learning_rate
@@ -818,11 +820,21 @@ def test_tune_batch_size_lr_cpu(
 
     backend_config = {**RAY_BACKEND_CONFIG}
 
+    num_samples = 200
     csv_filename = os.path.join(tmpdir, "dataset.csv")
-    dataset_csv = generate_data(config["input_features"], config["output_features"], csv_filename, num_examples=200)
+    dataset_csv = generate_data(
+        config["input_features"], config["output_features"], csv_filename, num_examples=num_samples
+    )
     dataset_parquet = create_data_set_to_use("parquet", dataset_csv)
 
     model = run_api_experiment(config, dataset=dataset_parquet, backend_config=backend_config)
-    assert model.config[TRAINER]["batch_size"] == expected_final_batch_size
+
+    if expected_final_batch_size is not None:
+        assert model.config[TRAINER]["batch_size"] == expected_final_batch_size
+    else:
+        # If we don't specify a batch size, we should validate the batch size against the training dataset size
+        num_train_samples = num_samples * DEFAULT_PROBABILITIES[0]
+        assert 2 < model.config[TRAINER]["batch_size"] <= MAX_BATCH_SIZE_DATASET_FRACTION * num_train_samples
+
     assert model.config[TRAINER]["learning_rate"] == expected_final_learning_rate
 
diff --git a/tests/integration_tests/test_trainer.py b/tests/integration_tests/test_trainer.py
index 0d9451e7f0a..62318290899 100644
--- a/tests/integration_tests/test_trainer.py
+++ b/tests/integration_tests/test_trainer.py
@@ -10,7 +10,7 @@
 
 from ludwig.api import LudwigModel
 from ludwig.callbacks import Callback
-from ludwig.constants import BATCH_SIZE, TRAINER
+from ludwig.constants import BATCH_SIZE, MAX_BATCH_SIZE_DATASET_FRACTION, TRAINER
 from tests.integration_tests.utils import (
     binary_feature,
     category_feature,
@@ -94,8 +94,9 @@ def test_tune_batch_size_and_lr(tmpdir, eval_batch_size, is_cpu):
         vector_feature(),
     ]
 
+    num_samples = 30
     csv_filename = os.path.join(tmpdir, "training.csv")
-    data_csv = generate_data(input_features, output_features, csv_filename)
+    data_csv = generate_data(input_features, output_features, csv_filename, num_examples=num_samples)
     val_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "validation.csv"))
     test_csv = shutil.copyfile(data_csv, os.path.join(tmpdir, "test.csv"))
 
@@ -133,8 +134,8 @@ def check_postconditions(model):
 
         assert model.config_obj.trainer.batch_size != "auto"
         assert model.config_obj.trainer.batch_size > 1
-        # 16 is the largest possible batch size for this dataset
-        assert model.config_obj.trainer.batch_size == 16
+        # 4 is the largest possible batch size for this dataset (20% of dataset size)
+        assert model.config_obj.trainer.batch_size <= MAX_BATCH_SIZE_DATASET_FRACTION * num_samples
 
         assert model.config_obj.trainer.eval_batch_size != "auto"
         assert model.config_obj.trainer.eval_batch_size > 1
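
The standalone sketch below (not part of the patch) illustrates the policy the change implements: a candidate batch size is valid only if it is at most MAX_BATCH_SIZE_DATASET_FRACTION (20%) of the dataset size and at most max_batch_size, candidates grow by doubling from MIN_POSSIBLE_BATCH_SIZE, and tuning falls back to the minimum when even the first candidate is invalid. It omits the throughput measurement the real tuner uses to pick among valid candidates, and the function names are illustrative, not Ludwig's API.

# Standalone sketch (assumed names; not part of the diff) of the capped batch size tuning policy.

MIN_POSSIBLE_BATCH_SIZE = 2
MAX_BATCH_SIZE_DATASET_FRACTION = 0.2


def _is_valid_batch_size(batch_size: int, dataset_len: int, max_batch_size: int) -> bool:
    # Valid only if at most 20% of the dataset size and at most max_batch_size.
    is_smaller_than_training_set = batch_size <= MAX_BATCH_SIZE_DATASET_FRACTION * dataset_len
    is_under_max_batch_size = batch_size <= max_batch_size
    return is_smaller_than_training_set and is_under_max_batch_size


def select_best_batch_size_sketch(dataset_len: int, max_batch_size: int) -> int:
    # Explore batch sizes by doubling from the minimum, keeping the largest valid one.
    # (The real tuner also measures samples/sec at each candidate; that part is omitted here.)
    best_batch_size = None
    batch_size = MIN_POSSIBLE_BATCH_SIZE
    while _is_valid_batch_size(batch_size, dataset_len, max_batch_size):
        best_batch_size = batch_size
        batch_size *= 2

    # Fallback added by this patch: if even the first candidate is invalid,
    # use the minimum batch size instead of returning None.
    if best_batch_size is None:
        best_batch_size = MIN_POSSIBLE_BATCH_SIZE
    return best_batch_size


if __name__ == "__main__":
    print(select_best_batch_size_sketch(dataset_len=200, max_batch_size=256))  # 32 (20% cap is 40)
    print(select_best_batch_size_sketch(dataset_len=5, max_batch_size=256))  # 2 (fallback to minimum)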