train.py: adding optional tensorboard
rafaelvalle committed May 31, 2019
1 parent 9168aea commit 4dc23ec
Showing 1 changed file with 8 additions and 2 deletions.
train.py (10 changes: 8 additions & 2 deletions)
@@ -61,7 +61,7 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
 
 def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
           sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
-          checkpoint_path):
+          checkpoint_path, with_tensorboard):
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     #=====START: ADDED FOR DISTRIBUTED======
@@ -107,6 +107,10 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
             os.chmod(output_directory, 0o775)
         print("output directory", output_directory)
 
+    if with_tensorboard and rank == 0:
+        from tensorboardX import SummaryWriter
+        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
+
     model.train()
     epoch_offset = max(0, int(iteration / len(train_loader)))
     # ================ MAIN TRAINNIG LOOP! ===================
@@ -128,13 +132,15 @@ def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
 
             if fp16_run:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
-                    scaled_loss.backward()
+                    scaled_loss.backward()
             else:
                 loss.backward()
 
             optimizer.step()
 
             print("{}:\t{:.9f}".format(iteration, reduced_loss))
+            if with_tensorboard and rank == 0:
+                logger.add_scalar('training_loss', reduced_loss, i)
 
             if (iteration % iters_per_checkpoint == 0):
                 if rank == 0:
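
The logging this commit adds boils down to a small, reusable pattern: construct a `tensorboardX.SummaryWriter` once (on rank 0 only), then call `add_scalar` each iteration. The sketch below is a minimal reconstruction of that pattern, not the repository's code; the `demo_output` directory and the synthetic loss values are assumptions for illustration.

```python
# Minimal sketch of the optional-TensorBoard pattern from this commit.
# Assumptions: 'demo_output' stands in for output_directory, and the
# synthetic loss stands in for reduced_loss from the training loop.
import os

with_tensorboard = True  # mirrors the new train() argument
rank = 0                 # only rank 0 creates the writer, as in the diff

if with_tensorboard and rank == 0:
    # Deferred import, as in the commit: tensorboardX is only needed
    # when logging is actually enabled.
    from tensorboardX import SummaryWriter
    logger = SummaryWriter(os.path.join('demo_output', 'logs'))

for i in range(100):
    reduced_loss = 1.0 / (i + 1)  # stand-in for the real training loss
    if with_tensorboard and rank == 0:
        logger.add_scalar('training_loss', reduced_loss, i)

if with_tensorboard and rank == 0:
    logger.close()
```

The resulting event files land in `<output_directory>/logs` and can be inspected with `tensorboard --logdir <output_directory>/logs`. Note that the commit logs against the per-epoch batch index `i` rather than the global `iteration`, so curves from successive epochs are plotted over the same step range.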

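The diff context in the last hunk also shows the mixed-precision path, which uses NVIDIA apex's `amp.scale_loss` context manager. For readers unfamiliar with it, here is a hedged, self-contained sketch of that documented apex pattern; the toy model, data, and `opt_level="O1"` are illustrative assumptions, not taken from this repository.

```python
# Sketch of the apex amp loss-scaling pattern visible in the diff context.
# Toy model/data and opt_level are illustrative assumptions; requires a
# CUDA device and https://github.com/NVIDIA/apex installed.
import torch
from apex import amp

model = torch.nn.Linear(4, 1).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

x = torch.randn(8, 4).cuda()
loss = model(x).pow(2).mean()

optimizer.zero_grad()
# scale_loss multiplies the loss by the current loss scale so fp16
# gradients do not underflow; gradients are unscaled on exit.
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
```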