From da8888008070954c00899940b1e8a6089fc107c1 Mon Sep 17 00:00:00 2001
From: Felix Dittrich
Date: Tue, 1 Oct 2024 10:42:24 +0200
Subject: [PATCH] [Bug] Fix eval scripts + possible overflow in Resize (#1715)

---
 api/app/vision.py                             |  2 +-
 doctr/transforms/modules/pytorch.py           | 13 +++----
 doctr/transforms/modules/tensorflow.py        | 26 ++++++++------
 .../classification/latency_tensorflow.py      |  2 +-
 .../train_tensorflow_character.py             |  2 +-
 .../train_tensorflow_orientation.py           |  2 +-
 references/detection/evaluate_tensorflow.py   |  2 +-
 references/detection/latency_tensorflow.py    |  2 +-
 references/detection/train_tensorflow.py      |  2 +-
 references/recognition/evaluate_tensorflow.py |  2 +-
 references/recognition/latency_tensorflow.py  |  2 +-
 references/recognition/train_tensorflow.py    |  2 +-
 scripts/analyze.py                            |  2 +-
 scripts/detect_text.py                        |  2 +-
 scripts/evaluate.py                           | 35 ++++++++++++++++---
 scripts/evaluate_kie.py                       | 35 ++++++++++++++++---
 tests/pytorch/test_transforms_pt.py           | 16 +++++++++
 tests/tensorflow/test_transforms_tf.py        | 16 +++++++++
 18 files changed, 128 insertions(+), 37 deletions(-)

diff --git a/api/app/vision.py b/api/app/vision.py
index 005c8d1548..144b5e4c3b 100644
--- a/api/app/vision.py
+++ b/api/app/vision.py
@@ -6,7 +6,7 @@
 
 import tensorflow as tf
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py
index f893afc2f7..639b27e2cf 100644
--- a/doctr/transforms/modules/pytorch.py
+++ b/doctr/transforms/modules/pytorch.py
@@ -74,16 +74,18 @@ def forward(
                 if self.symmetric_pad:
                     half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2))
                     _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1])
+                # Pad image
                 img = pad(img, _pad)
 
             # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
             if target is not None:
+                if self.symmetric_pad:
+                    offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
+
                 if self.preserve_aspect_ratio:
                     # Get absolute coords
                     if target.shape[1:] == (4,):
                         if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
-                            if np.max(target) <= 1:
-                                offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
                             target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1]
                             target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2]
                         else:
@@ -91,16 +93,15 @@ def forward(
                             target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2]
                     elif target.shape[1:] == (4, 2):
                         if isinstance(self.size, (tuple, list)) and self.symmetric_pad:
-                            if np.max(target) <= 1:
-                                offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2]
                             target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1]
                             target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2]
                         else:
                             target[..., 0] *= raw_shape[-1] / img.shape[-1]
                             target[..., 1] *= raw_shape[-2] / img.shape[-2]
                     else:
-                        raise AssertionError
-                return img, target
+                        raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)")
+
+                return img, np.clip(target, 0, 1)
 
             return img
 
diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py
index b3f7bcfd8a..4b00a9359f 100644
--- a/doctr/transforms/modules/tensorflow.py
+++ b/doctr/transforms/modules/tensorflow.py
@@ -107,29 +107,34 @@ def __call__(
         img: tf.Tensor,
         target: Optional[np.ndarray] = None,
     ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]:
         input_dtype = img.dtype
+        self.output_size = (
+            (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size
+        )
         img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias)
         # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio
         raw_shape = img.shape[:2]
+        if self.symmetric_pad:
+            half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0)
         if self.preserve_aspect_ratio:
             if isinstance(self.output_size, (tuple, list)):
                 # In that case we need to pad because we want to enforce both width and height
                 if not self.symmetric_pad:
-                    offset = (0, 0)
+                    half_pad = (0, 0)
                 elif self.output_size[0] == img.shape[0]:
-                    offset = (0, int((self.output_size[1] - img.shape[1]) / 2))
-                else:
-                    offset = (int((self.output_size[0] - img.shape[0]) / 2), 0)
-                img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size)
+                    half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2))
+                # Pad image
+                img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size)
 
         # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio)
         if target is not None:
+            if self.symmetric_pad:
+                offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1]
+
             if self.preserve_aspect_ratio:
                 # Get absolute coords
                 if target.shape[1:] == (4,):
                     if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
                         target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1]
                         target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0]
                     else:
@@ -137,16 +142,15 @@ def __call__(
                         target[:, [1, 3]] *= raw_shape[0] / img.shape[0]
                 elif target.shape[1:] == (4, 2):
                     if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad:
-                        if np.max(target) <= 1:
-                            offset = offset[0] / img.shape[0], offset[1] / img.shape[1]
                         target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / img.shape[1]
                         target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0]
                     else:
                         target[..., 0] *= raw_shape[1] / img.shape[1]
                         target[..., 1] *= raw_shape[0] / img.shape[0]
                 else:
-                    raise AssertionError
-            return tf.cast(img, dtype=input_dtype), target
+                    raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)")
+
+            return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1)
 
         return tf.cast(img, dtype=input_dtype)
 
diff --git a/references/classification/latency_tensorflow.py b/references/classification/latency_tensorflow.py
index fc010df91a..6ccdefac18 100644
--- a/references/classification/latency_tensorflow.py
+++ b/references/classification/latency_tensorflow.py
@@ -20,7 +20,7 @@
 
 def main(args):
     if args.gpu:
-        gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+        gpu_devices = tf.config.list_physical_devices("GPU")
         if any(gpu_devices):
             tf.config.experimental.set_memory_growth(gpu_devices[0], True)
     else:
diff --git a/references/classification/train_tensorflow_character.py b/references/classification/train_tensorflow_character.py
index b2d24f2dbf..89d0165d90 100644
--- a/references/classification/train_tensorflow_character.py
+++ b/references/classification/train_tensorflow_character.py
@@ -18,7 +18,7 @@
 
 from doctr.models import login_to_hub, push_to_hf_hub
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py
index e063174944..a7d3b96943 100644
--- a/references/classification/train_tensorflow_orientation.py
+++ b/references/classification/train_tensorflow_orientation.py
@@ -18,7 +18,7 @@
 
 from doctr.models import login_to_hub, push_to_hf_hub
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/references/detection/evaluate_tensorflow.py b/references/detection/evaluate_tensorflow.py
index 4eef9a40b7..76bd29b59a 100644
--- a/references/detection/evaluate_tensorflow.py
+++ b/references/detection/evaluate_tensorflow.py
@@ -17,7 +17,7 @@
 from keras import mixed_precision
 from tqdm import tqdm
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/references/detection/latency_tensorflow.py b/references/detection/latency_tensorflow.py
index e3e0d1d8af..39c0cd6e36 100644
--- a/references/detection/latency_tensorflow.py
+++ b/references/detection/latency_tensorflow.py
@@ -20,7 +20,7 @@
 
 def main(args):
     if args.gpu:
-        gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+        gpu_devices = tf.config.list_physical_devices("GPU")
         if any(gpu_devices):
             tf.config.experimental.set_memory_growth(gpu_devices[0], True)
     else:
diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py
index b9c14494ad..5e71909f3d 100644
--- a/references/detection/train_tensorflow.py
+++ b/references/detection/train_tensorflow.py
@@ -19,7 +19,7 @@
 
 from doctr.models import login_to_hub, push_to_hf_hub
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/references/recognition/evaluate_tensorflow.py b/references/recognition/evaluate_tensorflow.py
index 4c9d125285..9fea4f02ed 100644
--- a/references/recognition/evaluate_tensorflow.py
+++ b/references/recognition/evaluate_tensorflow.py
@@ -14,7 +14,7 @@
 from keras import mixed_precision
 from tqdm import tqdm
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/references/recognition/latency_tensorflow.py b/references/recognition/latency_tensorflow.py
index 405cf56892..318ff03fcb 100644
--- a/references/recognition/latency_tensorflow.py
+++ b/references/recognition/latency_tensorflow.py
@@ -20,7 +20,7 @@
 
 def main(args):
     if args.gpu:
-        gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+        gpu_devices = tf.config.list_physical_devices("GPU")
         if any(gpu_devices):
             tf.config.experimental.set_memory_growth(gpu_devices[0], True)
     else:
diff --git a/references/recognition/train_tensorflow.py b/references/recognition/train_tensorflow.py
index c76355a2f2..ca04cb1200 100644
--- a/references/recognition/train_tensorflow.py
+++ b/references/recognition/train_tensorflow.py
@@ -20,7 +20,7 @@
 
 from doctr.models import login_to_hub, push_to_hf_hub
 
-gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+gpu_devices = tf.config.list_physical_devices("GPU")
 if any(gpu_devices):
     tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/scripts/analyze.py b/scripts/analyze.py
index 94415267a2..fdffa30e48 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -16,7 +16,7 @@
 if is_tf_available():
     import tensorflow as tf
 
-    gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+    gpu_devices = tf.config.list_physical_devices("GPU")
     if any(gpu_devices):
         tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/scripts/detect_text.py b/scripts/detect_text.py
index f65b6685df..e3ca08c7b0 100644
--- a/scripts/detect_text.py
+++ b/scripts/detect_text.py
@@ -20,7 +20,7 @@
 if is_tf_available():
     import tensorflow as tf
 
-    gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+    gpu_devices = tf.config.list_physical_devices("GPU")
     if any(gpu_devices):
         tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
index bc9459b727..86dbc0e561 100644
--- a/scripts/evaluate.py
+++ b/scripts/evaluate.py
@@ -11,6 +11,7 @@
 from tqdm import tqdm
 
 from doctr import datasets
+from doctr import transforms as T
 from doctr.file_utils import is_tf_available
 from doctr.models import ocr_predictor
 from doctr.utils.geometry import extract_crops, extract_rcrops
@@ -20,7 +21,7 @@
 if is_tf_available():
     import tensorflow as tf
 
-    gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+    gpu_devices = tf.config.list_physical_devices("GPU")
     if any(gpu_devices):
         tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 else:
@@ -35,12 +36,24 @@ def main(args):
     if not args.rotation:
         args.eval_straight = True
 
+    input_shape = (args.size, args.size)
+
+    # Define a transformation function that converts the annotation
+    # into the format required by the Resize transformation
+    def _transform(img, target):
+        boxes = target["boxes"]
+        transformed_img, transformed_boxes = T.Resize(
+            input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad
+        )(img, boxes)
+        return transformed_img, {"boxes": transformed_boxes, "labels": target["labels"]}
+
     predictor = ocr_predictor(
         args.detection,
         args.recognition,
         pretrained=True,
         reco_bs=args.batch_size,
-        preserve_aspect_ratio=False,
+        preserve_aspect_ratio=False,  # we handle the transformation directly in the dataset so this is set to False
+        symmetric_pad=False,  # we handle the transformation directly in the dataset so this is set to False
         assume_straight_pages=not args.rotation,
     )
 
@@ -48,11 +61,22 @@ def main(args):
         testset = datasets.OCRDataset(
             img_folder=args.img_folder,
             label_file=args.label_file,
+            sample_transforms=_transform,
         )
         sets = [testset]
     else:
-        train_set = datasets.__dict__[args.dataset](train=True, download=True, use_polygons=not args.eval_straight)
-        val_set = datasets.__dict__[args.dataset](train=False, download=True, use_polygons=not args.eval_straight)
+        train_set = datasets.__dict__[args.dataset](
+            train=True,
+            download=True,
+            use_polygons=not args.eval_straight,
+            sample_transforms=_transform,
+        )
+        val_set = datasets.__dict__[args.dataset](
+            train=False,
+            download=True,
+            use_polygons=not args.eval_straight,
+            sample_transforms=_transform,
+        )
         sets = [train_set, val_set]
 
     reco_metric = TextMatch()
@@ -190,6 +214,9 @@ def parse_args():
     parser.add_argument("--label_file", type=str, default=None, help="Only for local sets, path to labels")
     parser.add_argument("--rotation", dest="rotation", action="store_true", help="run rotated OCR + postprocessing")
     parser.add_argument("-b", "--batch_size", type=int, default=32, help="batch size for recognition")
+    parser.add_argument("--size", type=int, default=1024, help="model input size, H = W")
+    parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image")
+    parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically")
     parser.add_argument("--samples", type=int, default=None, help="evaluate only on the N first samples")
     parser.add_argument(
         "--eval-straight",
diff --git a/scripts/evaluate_kie.py b/scripts/evaluate_kie.py
index b3d75d9beb..ca17332e2c 100644
--- a/scripts/evaluate_kie.py
+++ b/scripts/evaluate_kie.py
@@ -13,6 +13,7 @@
 from tqdm import tqdm
 
 from doctr import datasets
+from doctr import transforms as T
 from doctr.file_utils import is_tf_available
 from doctr.models import kie_predictor
 from doctr.utils.geometry import extract_crops, extract_rcrops
@@ -22,7 +23,7 @@
 if is_tf_available():
     import tensorflow as tf
 
-    gpu_devices = tf.config.experimental.list_physical_devices("GPU")
+    gpu_devices = tf.config.list_physical_devices("GPU")
     if any(gpu_devices):
         tf.config.experimental.set_memory_growth(gpu_devices[0], True)
 else:
@@ -37,12 +38,24 @@ def main(args):
     if not args.rotation:
         args.eval_straight = True
 
+    input_shape = (args.size, args.size)
+
+    # Define a transformation function that converts the annotation
+    # into the format required by the Resize transformation
+    def _transform(img, target):
+        boxes = target["boxes"]
+        transformed_img, transformed_boxes = T.Resize(
+            input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad
+        )(img, boxes)
+        return transformed_img, {"boxes": transformed_boxes, "labels": target["labels"]}
+
     predictor = kie_predictor(
         args.detection,
         args.recognition,
         pretrained=True,
         reco_bs=args.batch_size,
-        preserve_aspect_ratio=False,
+        preserve_aspect_ratio=False,  # we handle the transformation directly in the dataset so this is set to False
+        symmetric_pad=False,  # we handle the transformation directly in the dataset so this is set to False
         assume_straight_pages=not args.rotation,
     )
 
@@ -50,11 +63,22 @@ def main(args):
         testset = datasets.OCRDataset(
             img_folder=args.img_folder,
             label_file=args.label_file,
+            sample_transforms=_transform,
         )
         sets = [testset]
     else:
-        train_set = datasets.__dict__[args.dataset](train=True, download=True, use_polygons=not args.eval_straight)
-        val_set = datasets.__dict__[args.dataset](train=False, download=True, use_polygons=not args.eval_straight)
+        train_set = datasets.__dict__[args.dataset](
+            train=True,
+            download=True,
+            use_polygons=not args.eval_straight,
+            sample_transforms=_transform,
+        )
+        val_set = datasets.__dict__[args.dataset](
+            train=False,
+            download=True,
+            use_polygons=not args.eval_straight,
+            sample_transforms=_transform,
+        )
         sets = [train_set, val_set]
 
     reco_metric = TextMatch()
@@ -187,6 +211,9 @@ def parse_args():
     parser.add_argument("--label_file", type=str, default=None, help="Only for local sets, path to labels")
     parser.add_argument("--rotation", dest="rotation", action="store_true", help="run rotated OCR + postprocessing")
    parser.add_argument("-b", "--batch_size", type=int, default=32, help="batch size for recognition")
+    parser.add_argument("--size", type=int, default=1024, help="model input size, H = W")
+    parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image")
+    parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically")
     parser.add_argument("--samples", type=int, default=None, help="evaluate only on the N first samples")
     parser.add_argument(
         "--eval-straight",
diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py
index 2567dd8486..3c11412556 100644
--- a/tests/pytorch/test_transforms_pt.py
+++ b/tests/pytorch/test_transforms_pt.py
@@ -66,6 +66,22 @@ def test_resize():
     out = transfo(input_t)
     assert out.dtype == torch.float16
 
+    # --- Test with target (bounding boxes) ---
+
+    target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]])
+    output_size = (64, 64)
+
+    transfo = Resize(output_size, preserve_aspect_ratio=True)
+    input_t = torch.ones((3, 32, 64), dtype=torch.float32)
+    out, new_target = transfo(input_t, target_boxes)
+
+    assert out.shape[-2:] == output_size
+    assert new_target.shape == target_boxes.shape
+    assert np.all(new_target >= 0) and np.all(new_target <= 1)
+
+    out = transfo(input_t)
+    assert out.shape[-2:] == output_size
+
 
 @pytest.mark.parametrize(
     "rgb_min",
diff --git a/tests/tensorflow/test_transforms_tf.py b/tests/tensorflow/test_transforms_tf.py
index e53945f2e3..5fa87eab8a 100644
--- a/tests/tensorflow/test_transforms_tf.py
+++ b/tests/tensorflow/test_transforms_tf.py
@@ -48,6 +48,22 @@ def test_resize():
     out = transfo(input_t)
     assert out.dtype == tf.float16
 
+    # --- Test with target (bounding boxes) ---
+
+    target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]])
+    output_size = (64, 64)
+
+    transfo = T.Resize(output_size, preserve_aspect_ratio=True)
+    input_t = tf.cast(tf.fill([64, 32, 3], 1), dtype=tf.float32)
+    out, new_target = transfo(input_t, target_boxes)
+
+    assert out.shape[:2] == output_size
+    assert new_target.shape == target_boxes.shape
+    assert np.all(new_target >= 0) and np.all(new_target <= 1)
+
+    out = transfo(input_t)
+    assert out.shape[:2] == output_size
+
 
 def test_compose():
     output_size = (16, 16)