From 0545159ad3dbc029a8fb97d2bba2d23dcc161bec Mon Sep 17 00:00:00 2001 From: Langshi Chen Date: Mon, 24 Apr 2023 16:18:51 +0800 Subject: [PATCH] [DIST] Set data_sync_drop_remainder as true by default. (#143) 1. set data_sync_drop_remainder to true by default. Signed-off-by: langshi.cls --- docs/tutorial/ranking/criteo/train.py | 24 +++++++++++-------- docs/tutorial/ranking/taobao/train.py | 11 +++++---- hybridbackend/tensorflow/data/__init__.py | 2 +- .../data/tests/sync_replicas_dataset_test.py | 3 ++- .../tensorflow/estimator/estimator.py | 6 ++--- hybridbackend/tensorflow/keras/model.py | 6 ++--- 6 files changed, 30 insertions(+), 22 deletions(-) diff --git a/docs/tutorial/ranking/criteo/train.py b/docs/tutorial/ranking/criteo/train.py index ec30cf0d..94c171d5 100755 --- a/docs/tutorial/ranking/criteo/train.py +++ b/docs/tutorial/ranking/criteo/train.py @@ -112,13 +112,17 @@ def train(self, filenames): self._args.top_mlp_dims) loss = self.compute_loss(logits, labels) step = tf.train.get_or_create_global_step() - train_op = sgd_decay_optimize( - loss, - lr_initial_value=self._args.lr_initial_value, - lr_warmup_steps=self._args.lr_warmup_steps, - lr_decay_start_step=self._args.lr_decay_start_step, - lr_decay_steps=self._args.lr_decay_steps) - return step, loss, train_op + train_auc, train_auc_update_op = hb.metrics.auc( + labels=labels, + predictions=logits, name='train_auc') + with tf.control_dependencies([train_auc_update_op]): + train_op = sgd_decay_optimize( + loss, + lr_initial_value=self._args.lr_initial_value, + lr_warmup_steps=self._args.lr_warmup_steps, + lr_decay_start_step=self._args.lr_decay_start_step, + lr_decay_steps=self._args.lr_decay_steps) + return step, loss, train_op, train_auc def evaluate(self, filenames): r'''Evaluate model. @@ -160,7 +164,7 @@ def main(args): train_filenames = args.filenames eval_filenames = args.filenames model = RankingModel(args) - step, loss, train_op = model.train(train_filenames) + step, loss, train_op, train_auc = model.train(train_filenames) hooks = [] if args.eval_every_n_iter is not None: @@ -171,7 +175,7 @@ def main(args): if args.log_every_n_iter is not None: hooks.append( tf.train.LoggingTensorHook( - {'step': step, 'loss': loss}, + {'step': step, 'loss': loss, 'train_auc': train_auc}, every_n_iter=args.log_every_n_iter)) if args.train_max_steps is not None: hooks.append(tf.train.StopAtStepHook(args.train_max_steps)) @@ -236,5 +240,5 @@ def main(args): disable_imputation=parsed.disable_imputation, disable_transform=True, override_embedding_size=parsed.embedding_dim) - with hb.scope(): + with hb.scope(data_sync_drop_remainder=False): main(parsed) diff --git a/docs/tutorial/ranking/taobao/train.py b/docs/tutorial/ranking/taobao/train.py index 5a6d83b8..3d5c30a5 100755 --- a/docs/tutorial/ranking/taobao/train.py +++ b/docs/tutorial/ranking/taobao/train.py @@ -113,8 +113,11 @@ def train(self, filenames): loss = self.compute_loss(logits, labels) step = tf.train.get_or_create_global_step() opt = tf.train.AdagradOptimizer(learning_rate=self._args.lr) - train_op = opt.minimize(loss, global_step=step) - return step, loss, train_op + train_auc, train_auc_update_op = hb.metrics.auc( + labels=labels, predictions=logits, name='train_auc') + with tf.control_dependencies([train_auc_update_op]): + train_op = opt.minimize(loss, global_step=step) + return step, loss, train_op, train_auc def evaluate(self, filenames): r'''Evaluate model. @@ -148,7 +151,7 @@ def main(args): train_filenames = args.filenames eval_filenames = args.filenames model = RankingModel(args) - step, loss, train_op = model.train(train_filenames) + step, loss, train_op, train_auc = model.train(train_filenames) hooks = [] if args.eval_every_n_iter is not None: @@ -159,7 +162,7 @@ def main(args): if args.log_every_n_iter is not None: hooks.append( tf.train.LoggingTensorHook( - {'step': step, 'loss': loss}, + {'step': step, 'loss': loss, 'train_auc': train_auc}, every_n_iter=args.log_every_n_iter)) if args.train_max_steps is not None: hooks.append(tf.train.StopAtStepHook(args.train_max_steps)) diff --git a/hybridbackend/tensorflow/data/__init__.py b/hybridbackend/tensorflow/data/__init__.py index 620df850..ad49f79d 100644 --- a/hybridbackend/tensorflow/data/__init__.py +++ b/hybridbackend/tensorflow/data/__init__.py @@ -42,4 +42,4 @@ _ = ( _ctx.get().options .register('data_batch_count', 1) - .register('data_sync_drop_remainder', False)) + .register('data_sync_drop_remainder', True)) diff --git a/hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py b/hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py index bc141b10..5c3f644e 100644 --- a/hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py +++ b/hybridbackend/tensorflow/data/tests/sync_replicas_dataset_test.py @@ -64,7 +64,8 @@ def _test_distributed(rank): batch_size = 10 with tf.Graph().as_default(): - with hb.scope(mode=tf.estimator.ModeKeys.TRAIN): + with hb.scope( + data_sync_drop_remainder=False, mode=tf.estimator.ModeKeys.TRAIN): with tf.device('/cpu:0'): ds = tf.data.Dataset.range(100 + rank * 50) ds = ds.batch(batch_size=batch_size) diff --git a/hybridbackend/tensorflow/estimator/estimator.py b/hybridbackend/tensorflow/estimator/estimator.py index 6f15dc96..cf901525 100644 --- a/hybridbackend/tensorflow/estimator/estimator.py +++ b/hybridbackend/tensorflow/estimator/estimator.py @@ -183,9 +183,9 @@ def __init__(self, model_fn, **kwargs): ''' kwargs['config'] = RunConfig.build(prototype=kwargs.pop('config', None)) model_dir = kwargs.get('model_dir', None) - self._train_drop_remainder = kwargs.pop('train_drop_remainder', None) - self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', None) - self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', None) + self._train_drop_remainder = kwargs.pop('train_drop_remainder', True) + self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', True) + self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', True) super().__init__( wraps_model_fn(model_fn, model_dir, kwargs['config']), diff --git a/hybridbackend/tensorflow/keras/model.py b/hybridbackend/tensorflow/keras/model.py index 29363295..d73b4949 100644 --- a/hybridbackend/tensorflow/keras/model.py +++ b/hybridbackend/tensorflow/keras/model.py @@ -473,9 +473,9 @@ class HybridBackendKerasModel(cls, HybridBackendKerasModelBase): ''' def __init__(self, *args, **kwargs): self._device_fn = device_function - self._train_drop_remainder = kwargs.pop('train_drop_remainder', None) - self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', None) - self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', None) + self._train_drop_remainder = kwargs.pop('train_drop_remainder', True) + self._eval_drop_remainder = kwargs.pop('eval_drop_remainder', True) + self._predict_drop_remainder = kwargs.pop('predict_drop_remainder', True) self._load_weights_dir = None self._load_weights_scope = None self._load_weights_skip_mismatched = True