diff --git a/references/classification/train_pytorch_character.py b/references/classification/train_pytorch_character.py index 5c6246bfc..2424a81c0 100644 --- a/references/classification/train_pytorch_character.py +++ b/references/classification/train_pytorch_character.py @@ -117,7 +117,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0.0, 0.0 - pbar = tqdm(train_loader, position=1) + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -157,7 +157,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): model.eval() # Validation loop val_loss, correct, samples, batch_cnt = 0, 0, 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) if torch.cuda.is_available(): @@ -174,6 +175,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): # Compute metric correct += (out.argmax(dim=1) == targets).sum().item() + pbar.set_description(f"Validation loss: {loss.item():.6}") + val_loss += loss.item() batch_cnt += 1 samples += images.shape[0] @@ -184,7 +187,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -219,7 +223,7 @@ def main(args): sampler=SequentialSampler(val_set), pin_memory=torch.cuda.is_available(), ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -228,7 +232,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - 
print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -248,9 +252,9 @@ def main(args): model = model.cuda() if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -283,7 +287,7 @@ def main(args): sampler=RandomSampler(train_set), pin_memory=torch.cuda.is_available(), ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if args.show_samples: x, target = next(iter(train_loader)) @@ -364,14 +368,15 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: @@ -393,7 +398,7 @@ def main(args): logger.report_scalar(title="Accuracy", series="acc", 
value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if args.wb: @@ -403,11 +408,11 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") dummy_batch = next(iter(val_loader)) dummy_input = dummy_batch[0].cuda() if torch.cuda.is_available() else dummy_batch[0] model_path = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index b96766858..49b908540 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -128,7 +128,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0.0, 0.0 - pbar = tqdm(train_loader, position=1) + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -168,7 +168,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): model.eval() # Validation loop val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) if torch.cuda.is_available(): @@ -185,6 +186,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): # Compute metric correct += (out.argmax(dim=1) == targets).sum().item() + pbar.set_description(f"Validation loss: {loss.item():.6}") + val_loss += 
loss.item() batch_cnt += 1 samples += images.shape[0] @@ -195,7 +198,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -227,7 +231,7 @@ def main(args): sampler=SequentialSampler(val_set), pin_memory=torch.cuda.is_available(), ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -236,7 +240,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -256,9 +260,9 @@ def main(args): model = model.cuda() if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -289,7 +293,7 @@ def main(args): sampler=RandomSampler(train_set), pin_memory=torch.cuda.is_available(), ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if args.show_samples: x, target = next(iter(train_loader)) @@ -370,14 +374,15 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + pbar.write(f"Epoch {epoch + 
1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: @@ -399,7 +404,7 @@ def main(args): logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if args.wb: @@ -409,11 +414,11 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") dummy_batch = next(iter(val_loader)) dummy_input = dummy_batch[0].cuda() if torch.cuda.is_available() else dummy_batch[0] model_path = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/classification/train_tensorflow_character.py b/references/classification/train_tensorflow_character.py index 6089ba2a8..ec67ab73d 100644 --- a/references/classification/train_tensorflow_character.py +++ b/references/classification/train_tensorflow_character.py @@ -100,7 +100,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): train_iter = iter(train_loader) # Iterate over the batches of the dataset 
epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_iter, position=1) + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -127,13 +127,16 @@ def evaluate(model, val_loader, batch_transforms): # Validation loop val_loss, correct, samples, batch_cnt = 0, 0, 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, training=False) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) # Compute metric correct += int((out.numpy().argmax(1) == targets.numpy()).sum()) + pbar.set_description(f"Validation loss: {loss.numpy().mean():.6}") + val_loss += loss.numpy().mean() batch_cnt += 1 samples += images.shape[0] @@ -151,7 +154,8 @@ def collate_fn(samples): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -184,7 +188,7 @@ def main(args): drop_last=False, collate_fn=collate_fn, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) @@ -206,9 +210,9 @@ def main(args): ]) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -239,7 +243,7 @@ def main(args): drop_last=True, collate_fn=collate_fn, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) @@ -334,14 +338,15 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, 
batch_transforms, optimizer, args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: @@ -363,7 +368,7 @@ def main(args): logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if args.wb: @@ -373,7 +378,7 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") if args.arch == "vit_b": # fixed batch size for vit dummy_input = [tf.TensorSpec([1, args.input_size, args.input_size, 3], tf.float32, name="input")] @@ -381,7 +386,7 @@ def main(args): # dynamic batch size dummy_input = [tf.TensorSpec([None, args.input_size, args.input_size, 3], tf.float32, name="input")] model_path, _ = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index c8bcd07cc..72be3eb83 100644 --- 
a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -114,7 +114,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): train_iter = iter(train_loader) # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_iter, position=1) + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -141,13 +141,16 @@ def evaluate(model, val_loader, batch_transforms): # Validation loop val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, training=False) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) # Compute metric correct += int((out.numpy().argmax(1) == targets.numpy()).sum()) + pbar.set_description(f"Validation loss: {loss.numpy().mean():.6}") + val_loss += loss.numpy().mean() batch_cnt += 1 samples += images.shape[0] @@ -165,7 +168,8 @@ def collate_fn(samples): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -195,7 +199,7 @@ def main(args): drop_last=False, collate_fn=collate_fn, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) @@ -217,9 +221,9 @@ def main(args): ]) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -249,7 +253,7 @@ def main(args): drop_last=True, collate_fn=collate_fn, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s 
({len(train_set)} samples in {train_loader.num_batches} batches)" ) @@ -343,14 +347,15 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, acc = evaluate(model, val_loader, batch_transforms) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") # W&B if args.wb: @@ -372,7 +377,7 @@ def main(args): logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if args.wb: @@ -382,7 +387,7 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") if args.arch in ["vit_s", "vit_b"]: # fixed batch size for vit dummy_input = [tf.TensorSpec([1, *(input_size), 3], tf.float32, name="input")] @@ -390,7 +395,7 @@ def main(args): # dynamic batch size dummy_input = [tf.TensorSpec([None, *(input_size), 3], tf.float32, name="input")] model_path, _ = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + 
pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index dc2244f9b..7fa9d37d5 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -110,7 +110,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_loader, position=1) + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -152,7 +152,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -170,6 +171,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + val_loss += out["loss"].item() batch_cnt += 1 @@ -179,7 +182,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -220,7 +224,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") with open(os.path.join(args.val_path, "labels.json"), "rb") as f: 
val_hash = hashlib.sha256(f.read()).hexdigest() @@ -235,7 +239,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -258,9 +262,9 @@ def main(args): val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | " f"Mean IoU: {mean_iou:.2%})" ) @@ -327,7 +331,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -418,21 +422,22 @@ def main(args): # Training loop for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss if 
args.save_interval_epoch: - print(f"Saving state at epoch: {epoch + 1}") + pbar.write(f"Saving state at epoch: {epoch + 1}") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}_epoch{epoch + 1}.pt") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" - print(log_msg) + pbar.write(log_msg) # W&B if args.wb: wandb.log({ @@ -457,8 +462,9 @@ def main(args): logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() diff --git a/references/detection/train_pytorch_ddp.py b/references/detection/train_pytorch_ddp.py index cf925eeb6..85607c2a7 100644 --- a/references/detection/train_pytorch_ddp.py +++ b/references/detection/train_pytorch_ddp.py @@ -116,7 +116,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_loader, position=1) + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -158,7 +158,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, args, amp=False): val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -176,6 +177,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, 
args, amp=False): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + val_loss += out["loss"].item() batch_cnt += 1 @@ -191,8 +194,8 @@ def main(rank: int, world_size: int, args): world_size (int): number of processes participating in the job args: other arguments passed through the CLI """ - - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if rank == 0 and args.push_to_hub: login_to_hub() @@ -235,7 +238,9 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)" + ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -254,7 +259,7 @@ def main(rank: int, world_size: int, args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -271,11 +276,11 @@ def main(rank: int, world_size: int, args): val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if rank == 0 and args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate( model, val_loader, batch_transforms, val_metric, args, amp=args.amp ) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | " f"Mean IoU: {mean_iou:.2%})" ) @@ -342,7 +347,7 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, 
) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -434,6 +439,7 @@ def main(rank: int, world_size: int, args): # Training loop for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") if rank == 0: # Validation loop at the end of each epoch @@ -441,18 +447,18 @@ def main(rank: int, world_size: int, args): model, val_loader, batch_transforms, val_metric, args, amp=args.amp ) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.module.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss if args.save_interval_epoch: - print(f"Saving state at epoch: {epoch + 1}") + pbar.write(f"Saving state at epoch: {epoch + 1}") torch.save(model.module.state_dict(), Path(args.output_dir) / f"{exp_name}_epoch{epoch + 1}.pt") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" - print(log_msg) + pbar.write(log_msg) # W&B if args.wb: wandb.log({ @@ -477,7 +483,7 @@ def main(rank: int, world_size: int, args): logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - 
print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if rank == 0: diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 64534d136..ed2ad2125 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -100,7 +100,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): train_iter = iter(train_loader) # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_iter, position=1) + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -128,7 +128,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): # Validation loop val_loss, batch_cnt = 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, target=targets, training=False, return_preds=True) # Compute metric @@ -140,6 +141,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + pbar.set_description(f"Validation loss: {out['loss'].numpy():.6}") + val_loss += out["loss"].numpy() batch_cnt += 1 @@ -149,7 +152,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -186,7 +190,7 @@ def main(args): shuffle=False, drop_last=False, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: @@ -212,9 +216,9 @@ def main(args): 
val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | " f"Mean IoU: {mean_iou:.2%})" ) @@ -281,7 +285,7 @@ def main(args): shuffle=True, drop_last=True, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: @@ -384,21 +388,23 @@ def main(args): # Training loop for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") + # Validation loop at the end of each epoch val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss if args.save_interval_epoch: - print(f"Saving state at epoch: {epoch + 1}") + pbar.write(f"Saving state at epoch: {epoch + 1}") model.save_weights(Path(args.output_dir) / f"{exp_name}_{epoch + 1}.weights.h5") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" - print(log_msg) + pbar.write(log_msg) # W&B if args.wb: 
wandb.log({ @@ -423,7 +429,7 @@ def main(args): logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if args.wb: run.finish() diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py index 4498cd3db..6d937a7dd 100644 --- a/references/recognition/train_pytorch.py +++ b/references/recognition/train_pytorch.py @@ -116,7 +116,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a model.train() # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_loader, position=1) + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -160,7 +160,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -176,6 +177,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): words = [] val_metric.update(targets, words) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + val_loss += out["loss"].item() batch_cnt += 1 @@ -185,7 +188,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -234,7 +238,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") 
+ pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -243,7 +247,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -266,9 +270,9 @@ def main(args): val_metric = TextMatch() if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") return st = time.time() @@ -335,7 +339,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if args.show_samples: x, target = next(iter(train_loader)) @@ -423,14 +427,15 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + 
pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print( + pbar.write( f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " f"(Exact: {exact_match:.2%} | Partial: {partial_match:.2%})" ) @@ -456,8 +461,9 @@ def main(args): logger.report_scalar(title="Partial Match", series="partial_match", value=partial_match, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() diff --git a/references/recognition/train_pytorch_ddp.py b/references/recognition/train_pytorch_ddp.py index 62c573ebe..df3be466d 100644 --- a/references/recognition/train_pytorch_ddp.py +++ b/references/recognition/train_pytorch_ddp.py @@ -49,7 +49,7 @@ def fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, sche model.train() # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_loader, position=1) + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: images = images.to(device) images = batch_transforms(images) @@ -92,7 +92,8 @@ def evaluate(model, device, val_loader, batch_transforms, val_metric, amp=False) val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: images = images.to(device) images = batch_transforms(images) if amp: @@ -107,6 +108,8 @@ def evaluate(model, device, val_loader, batch_transforms, val_metric, amp=False) words = [] val_metric.update(targets, words) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + val_loss += out["loss"].item() batch_cnt += 1 @@ -122,7 +125,8 @@ def main(rank: int, world_size: int, args): 
world_size (int): number of processes participating in the job args: other arguments passed through the CLI """ - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if rank == 0 and args.push_to_hub: login_to_hub() @@ -172,7 +176,9 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)" + ) batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -181,7 +187,7 @@ def main(rank: int, world_size: int, args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -203,11 +209,11 @@ def main(rank: int, world_size: int, args): val_metric = TextMatch() if rank == 0 and args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate( model, device, val_loader, batch_transforms, val_metric, amp=args.amp ) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") return st = time.time() @@ -274,7 +280,7 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if rank == 0 and args.show_samples: x, target = next(iter(train_loader)) @@ -353,6 +359,7 @@ def main(rank: int, world_size: 
int, args): train_loss, actual_lr = fit_one_epoch( model, device, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") if rank == 0: # Validation loop at the end of each epoch @@ -363,10 +370,10 @@ def main(rank: int, world_size: int, args): # All processes should see same parameters as they all start from same # random parameters and gradients are synchronized in backward passes. # Therefore, saving it in one process is sufficient. - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.module.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print( + pbar.write( f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " f"(Exact: {exact_match:.2%} | Partial: {partial_match:.2%})" ) @@ -394,8 +401,9 @@ def main(rank: int, world_size: int, args): ) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if rank == 0: if args.wb: run.finish() diff --git a/references/recognition/train_tensorflow.py b/references/recognition/train_tensorflow.py index a4ce1d706..6e723f71f 100644 --- a/references/recognition/train_tensorflow.py +++ b/references/recognition/train_tensorflow.py @@ -100,7 +100,7 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False): train_iter = iter(train_loader) # Iterate over the batches of the dataset epoch_train_loss, batch_cnt = 0, 0 - pbar = tqdm(train_iter, position=1) + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -128,7 +128,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): # Validation loop val_loss, 
batch_cnt = 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, target=targets, return_preds=True, training=False) # Compute metric @@ -138,6 +139,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): words = [] val_metric.update(targets, words) + pbar.set_description(f"Validation loss: {out['loss'].numpy().mean():.6}") + val_loss += out["loss"].numpy().mean() batch_cnt += 1 @@ -147,7 +150,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -193,7 +197,7 @@ def main(args): shuffle=False, drop_last=False, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) @@ -215,9 +219,9 @@ def main(args): ]) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") return st = time.time() @@ -285,7 +289,7 @@ def main(args): shuffle=True, drop_last=True, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) @@ -390,14 +394,15 @@ def main(args): # Training loop for epoch in range(args.epochs): train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch val_loss, exact_match, partial_match = evaluate(model, 
val_loader, batch_transforms, val_metric) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss - print( + pbar.write( f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " f"(Exact: {exact_match:.2%} | Partial: {partial_match:.2%})" ) @@ -423,8 +428,9 @@ def main(args): logger.report_scalar(title="Partial Match", series="partial_match", value=partial_match, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish()