diff --git a/doctr/utils/metrics.py b/doctr/utils/metrics.py index 09795d8f76..faea10a3ab 100644 --- a/doctr/utils/metrics.py +++ b/doctr/utils/metrics.py @@ -5,16 +5,14 @@ from typing import Dict, List, Optional, Tuple -import cv2 import numpy as np from anyascii import anyascii from scipy.optimize import linear_sum_assignment +from shapely.geometry import Polygon __all__ = [ "TextMatch", "box_iou", - "box_ioa", - "mask_iou", "polygon_iou", "nms", "LocalizationConfusion", @@ -158,66 +156,7 @@ def box_iou(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: return iou_mat -def box_ioa(boxes_1: np.ndarray, boxes_2: np.ndarray) -> np.ndarray: - """Computes the IoA (intersection over area) between two sets of bounding boxes: - ioa(i, j) = inter(i, j) / area(i) - - Args: - ---- - boxes_1: bounding boxes of shape (N, 4) in format (xmin, ymin, xmax, ymax) - boxes_2: bounding boxes of shape (M, 4) in format (xmin, ymin, xmax, ymax) - - Returns: - ------- - the IoA matrix of shape (N, M) - """ - ioa_mat: np.ndarray = np.zeros((boxes_1.shape[0], boxes_2.shape[0]), dtype=np.float32) - - if boxes_1.shape[0] > 0 and boxes_2.shape[0] > 0: - l1, t1, r1, b1 = np.split(boxes_1, 4, axis=1) - l2, t2, r2, b2 = np.split(boxes_2, 4, axis=1) - - left = np.maximum(l1, l2.T) - top = np.maximum(t1, t2.T) - right = np.minimum(r1, r2.T) - bot = np.minimum(b1, b2.T) - - intersection = np.clip(right - left, 0, np.Inf) * np.clip(bot - top, 0, np.Inf) - area = (r1 - l1) * (b1 - t1) - ioa_mat = intersection / area - - return ioa_mat - - -def mask_iou(masks_1: np.ndarray, masks_2: np.ndarray) -> np.ndarray: - """Computes the IoU between two sets of boolean masks - - Args: - ---- - masks_1: boolean masks of shape (N, H, W) - masks_2: boolean masks of shape (M, H, W) - - Returns: - ------- - the IoU matrix of shape (N, M) - """ - if masks_1.shape[1:] != masks_2.shape[1:]: - raise AssertionError("both boolean masks should have the same spatial shape") - - iou_mat: np.ndarray = np.zeros((masks_1.shape[0], masks_2.shape[0]), dtype=np.float32) - - if masks_1.shape[0] > 0 and masks_2.shape[0] > 0: - axes = tuple(range(2, masks_1.ndim + 1)) - intersection = np.logical_and(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes) - union = np.logical_or(masks_1[:, None, ...], masks_2[None, ...]).sum(axis=axes) - iou_mat = intersection / union - - return iou_mat - - -def polygon_iou( - polys_1: np.ndarray, polys_2: np.ndarray, mask_shape: Tuple[int, int], use_broadcasting: bool = False -) -> np.ndarray: +def polygon_iou(polys_1: np.ndarray, polys_2: np.ndarray) -> np.ndarray: """Computes the IoU between two sets of rotated bounding boxes Args: @@ -234,80 +173,18 @@ def polygon_iou( if polys_1.ndim != 3 or polys_2.ndim != 3: raise AssertionError("expects boxes to be in format (N, 4, 2)") - iou_mat: np.ndarray = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32) - - if polys_1.shape[0] > 0 and polys_2.shape[0] > 0: - if use_broadcasting: - masks_1 = rbox_to_mask(polys_1, shape=mask_shape) - masks_2 = rbox_to_mask(polys_2, shape=mask_shape) - iou_mat = mask_iou(masks_1, masks_2) - else: - # Save memory by doing the computation for each pair - for idx, b1 in enumerate(polys_1): - m1 = _rbox_to_mask(b1, mask_shape) - for _idx, b2 in enumerate(polys_2): - m2 = _rbox_to_mask(b2, mask_shape) - iou_mat[idx, _idx] = np.logical_and(m1, m2).sum() / np.logical_or(m1, m2).sum() - - return iou_mat + iou_mat = np.zeros((polys_1.shape[0], polys_2.shape[0]), dtype=np.float32) + shapely_polys_1 = [Polygon(poly) for poly in polys_1] + shapely_polys_2 = [Polygon(poly) for poly in polys_2] -def _rbox_to_mask(box: np.ndarray, shape: Tuple[int, int]) -> np.ndarray: - """Converts a rotated bounding box to a boolean mask + for i, poly1 in enumerate(shapely_polys_1): + for j, poly2 in enumerate(shapely_polys_2): + intersection_area = poly1.intersection(poly2).area + union_area = poly1.area + poly2.area - intersection_area + iou_mat[i, j] = intersection_area / union_area - Args: - ---- - box: rotated bounding box of shape (4, 2) - shape: spatial shapes of the output masks - - Returns: - ------- - the boolean mask of the specified shape - """ - mask: np.ndarray = np.zeros(shape, dtype=np.uint8) - # Get absolute coords - if not np.issubdtype(box.dtype, np.integer): - abs_box = box.copy() - abs_box[:, 0] = abs_box[:, 0] * shape[1] - abs_box[:, 1] = abs_box[:, 1] * shape[0] - abs_box = abs_box.round().astype(int) - else: - abs_box = box - abs_box[2:] = abs_box[2:] + 1 - cv2.fillPoly(mask, [abs_box - 1], 1.0) # type: ignore[call-overload] - - return mask.astype(bool) - - -def rbox_to_mask(boxes: np.ndarray, shape: Tuple[int, int]) -> np.ndarray: - """Converts rotated bounding boxes to boolean masks - - Args: - ---- - boxes: rotated bounding boxes of shape (N, 4, 2) - shape: spatial shapes of the output masks - - Returns: - ------- - the boolean masks of shape (N, H, W) - """ - masks: np.ndarray = np.zeros((boxes.shape[0], *shape), dtype=np.uint8) - - if boxes.shape[0] > 0: - # Get absolute coordinates - if not np.issubdtype(boxes.dtype, np.integer): - abs_boxes = boxes.copy() - abs_boxes[:, :, 0] = abs_boxes[:, :, 0] * shape[1] - abs_boxes[:, :, 1] = abs_boxes[:, :, 1] * shape[0] - abs_boxes = abs_boxes.round().astype(int) - else: - abs_boxes = boxes - abs_boxes[:, 2:] = abs_boxes[:, 2:] + 1 - - # TODO: optimize slicing to improve vectorization - for idx, _box in enumerate(abs_boxes): - cv2.fillPoly(masks[idx], [_box - 1], 1.0) # type: ignore[call-overload] - return masks.astype(bool) + return iou_mat def nms(boxes: np.ndarray, thresh: float = 0.5) -> List[int]: @@ -386,21 +263,15 @@ class LocalizationConfusion: ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match use_polygons: if set to True, predictions and targets will be expected to have rotated format - mask_shape: if use_polygons is True, describes the spatial shape of the image used - use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory """ def __init__( self, iou_thresh: float = 0.5, use_polygons: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), - use_broadcasting: bool = True, ) -> None: self.iou_thresh = iou_thresh self.use_polygons = use_polygons - self.mask_shape = mask_shape - self.use_broadcasting = use_broadcasting self.reset() def update(self, gts: np.ndarray, preds: np.ndarray) -> None: @@ -414,7 +285,7 @@ def update(self, gts: np.ndarray, preds: np.ndarray) -> None: if preds.shape[0] > 0: # Compute IoU if self.use_polygons: - iou_mat = polygon_iou(gts, preds, self.mask_shape, self.use_broadcasting) + iou_mat = polygon_iou(gts, preds) else: iou_mat = box_iou(gts, preds) self.tot_iou += float(iou_mat.max(axis=0).sum()) @@ -441,7 +312,7 @@ def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: precision = self.matches / self.num_preds if self.num_preds > 0 else None # mean IoU - mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None return recall, precision, mean_iou @@ -492,21 +363,15 @@ class OCRMetric: ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match use_polygons: if set to True, predictions and targets will be expected to have rotated format - mask_shape: if use_polygons is True, describes the spatial shape of the image used - use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory """ def __init__( self, iou_thresh: float = 0.5, use_polygons: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), - use_broadcasting: bool = True, ) -> None: self.iou_thresh = iou_thresh self.use_polygons = use_polygons - self.mask_shape = mask_shape - self.use_broadcasting = use_broadcasting self.reset() def update( @@ -533,7 +398,7 @@ def update( # Compute IoU if pred_boxes.shape[0] > 0: if self.use_polygons: - iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting) + iou_mat = polygon_iou(gt_boxes, pred_boxes) else: iou_mat = box_iou(gt_boxes, pred_boxes) @@ -577,7 +442,7 @@ def summary(self) -> Tuple[Dict[str, Optional[float]], Dict[str, Optional[float] ) # mean IoU (overall detected boxes) - mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None return recall, precision, mean_iou @@ -631,21 +496,15 @@ class DetectionMetric: ---- iou_thresh: minimum IoU to consider a pair of prediction and ground truth as a match use_polygons: if set to True, predictions and targets will be expected to have rotated format - mask_shape: if use_polygons is True, describes the spatial shape of the image used - use_broadcasting: if use_polygons is True, use broadcasting for IoU computation by consuming more memory """ def __init__( self, iou_thresh: float = 0.5, use_polygons: bool = False, - mask_shape: Tuple[int, int] = (1024, 1024), - use_broadcasting: bool = True, ) -> None: self.iou_thresh = iou_thresh self.use_polygons = use_polygons - self.mask_shape = mask_shape - self.use_broadcasting = use_broadcasting self.reset() def update( @@ -672,7 +531,7 @@ def update( # Compute IoU if pred_boxes.shape[0] > 0: if self.use_polygons: - iou_mat = polygon_iou(gt_boxes, pred_boxes, self.mask_shape, self.use_broadcasting) + iou_mat = polygon_iou(gt_boxes, pred_boxes) else: iou_mat = box_iou(gt_boxes, pred_boxes) @@ -701,7 +560,7 @@ def summary(self) -> Tuple[Optional[float], Optional[float], Optional[float]]: precision = self.num_matches / self.num_preds if self.num_preds > 0 else None # mean IoU (overall detected boxes) - mean_iou = self.tot_iou / self.num_preds if self.num_preds > 0 else None + mean_iou = round(self.tot_iou / self.num_preds, 2) if self.num_preds > 0 else None return recall, precision, mean_iou diff --git a/references/detection/evaluate_pytorch.py b/references/detection/evaluate_pytorch.py index 95f821da29..15f60df664 100644 --- a/references/detection/evaluate_pytorch.py +++ b/references/detection/evaluate_pytorch.py @@ -14,7 +14,6 @@ import time from pathlib import Path -import psutil import torch from torch.utils.data import DataLoader, SequentialSampler from torchvision.transforms import Normalize @@ -66,7 +65,6 @@ def main(args): args.workers = min(16, mp.cpu_count()) torch.backends.cudnn.benchmark = True - system_available_memory = int(psutil.virtual_memory().available / 1024**3) # Load docTR model model = detection.__dict__[args.arch]( @@ -134,11 +132,7 @@ def main(args): model = model.cuda() # Metrics - metric = LocalizationConfusion( - use_polygons=args.rotation, - mask_shape=input_shape, - use_broadcasting=True if system_available_memory > 62 else False, - ) + metric = LocalizationConfusion(use_polygons=args.rotation) print("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, test_loader, batch_transforms, metric, amp=args.amp) diff --git a/references/detection/evaluate_tensorflow.py b/references/detection/evaluate_tensorflow.py index 688d1271f6..74dae5604c 100644 --- a/references/detection/evaluate_tensorflow.py +++ b/references/detection/evaluate_tensorflow.py @@ -14,7 +14,6 @@ import time from pathlib import Path -import psutil import tensorflow as tf from tensorflow.keras import mixed_precision from tqdm import tqdm @@ -60,8 +59,6 @@ def main(args): if not isinstance(args.workers, int): args.workers = min(16, mp.cpu_count()) - system_available_memory = int(psutil.virtual_memory().available / 1024**3) - # AMP if args.amp: mixed_precision.set_global_policy("mixed_float16") @@ -115,11 +112,7 @@ def main(args): batch_transforms = T.Normalize(mean=mean, std=std) # Metrics - metric = LocalizationConfusion( - use_polygons=args.rotation, - mask_shape=input_shape[:2], - use_broadcasting=True if system_available_memory > 62 else False, - ) + metric = LocalizationConfusion(use_polygons=args.rotation) print("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, test_loader, batch_transforms, metric) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 4f64011518..aa0ca71f88 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -14,7 +14,6 @@ import time import numpy as np -import psutil import torch import wandb from torch.optim.lr_scheduler import CosineAnnealingLR, MultiplicativeLR, OneCycleLR, PolynomialLR @@ -178,7 +177,6 @@ def main(args): args.workers = min(16, mp.cpu_count()) torch.backends.cudnn.benchmark = True - system_available_memory = int(psutil.virtual_memory().available / 1024**3) st = time.time() val_set = DetectionDataset( @@ -246,11 +244,7 @@ def main(args): model = model.cuda() # Metrics - val_metric = LocalizationConfusion( - use_polygons=args.rotation and not args.eval_straight, - mask_shape=(args.input_size, args.input_size), - use_broadcasting=True if system_available_memory > 62 else False, - ) + val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if args.test_only: print("Running evaluation") diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 05ee7c890e..fa1be715b2 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -14,7 +14,6 @@ import time import numpy as np -import psutil import tensorflow as tf from tensorflow.keras import mixed_precision from tqdm.auto import tqdm @@ -136,8 +135,6 @@ def main(args): if not isinstance(args.workers, int): args.workers = min(16, mp.cpu_count()) - system_available_memory = int(psutil.virtual_memory().available / 1024**3) - # AMP if args.amp: mixed_precision.set_global_policy("mixed_float16") @@ -200,11 +197,8 @@ def main(args): print("Done.") # Metrics - val_metric = LocalizationConfusion( - use_polygons=args.rotation and not args.eval_straight, - mask_shape=(args.input_size, args.input_size), - use_broadcasting=True if system_available_memory > 62 else False, - ) + val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) + if args.test_only: print("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric) diff --git a/references/requirements.txt b/references/requirements.txt index 679df79518..5ded75a3b8 100644 --- a/references/requirements.txt +++ b/references/requirements.txt @@ -1,6 +1,5 @@ -e . tqdm wandb>=0.10.31 -psutil>=5.9.0 clearml>=1.11.1 matplotlib>=3.1.0 diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 7dc4eadbff..bc9459b727 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -56,16 +56,9 @@ def main(args): sets = [train_set, val_set] reco_metric = TextMatch() - if args.mask_shape: - det_metric = LocalizationConfusion( - iou_thresh=args.iou, use_polygons=not args.eval_straight, mask_shape=(args.mask_shape, args.mask_shape) - ) - e2e_metric = OCRMetric( - iou_thresh=args.iou, use_polygons=not args.eval_straight, mask_shape=(args.mask_shape, args.mask_shape) - ) - else: - det_metric = LocalizationConfusion(iou_thresh=args.iou, use_polygons=not args.eval_straight) - e2e_metric = OCRMetric(iou_thresh=args.iou, use_polygons=not args.eval_straight) + + det_metric = LocalizationConfusion(iou_thresh=args.iou, use_polygons=not args.eval_straight) + e2e_metric = OCRMetric(iou_thresh=args.iou, use_polygons=not args.eval_straight) sample_idx = 0 extraction_fn = extract_crops if args.eval_straight else extract_rcrops @@ -197,7 +190,6 @@ def parse_args(): parser.add_argument("--label_file", type=str, default=None, help="Only for local sets, path to labels") parser.add_argument("--rotation", dest="rotation", action="store_true", help="run rotated OCR + postprocessing") parser.add_argument("-b", "--batch_size", type=int, default=32, help="batch size for recognition") - parser.add_argument("--mask_shape", type=int, default=None, help="mask shape for mask iou (only for rotation)") parser.add_argument("--samples", type=int, default=None, help="evaluate only on the N first samples") parser.add_argument( "--eval-straight", diff --git a/scripts/evaluate_kie.py b/scripts/evaluate_kie.py index de624c214d..b3d75d9beb 100644 --- a/scripts/evaluate_kie.py +++ b/scripts/evaluate_kie.py @@ -58,16 +58,9 @@ def main(args): sets = [train_set, val_set] reco_metric = TextMatch() - if args.mask_shape: - det_metric = LocalizationConfusion( - iou_thresh=args.iou, use_polygons=not args.eval_straight, mask_shape=(args.mask_shape, args.mask_shape) - ) - e2e_metric = OCRMetric( - iou_thresh=args.iou, use_polygons=not args.eval_straight, mask_shape=(args.mask_shape, args.mask_shape) - ) - else: - det_metric = LocalizationConfusion(iou_thresh=args.iou, use_polygons=not args.eval_straight) - e2e_metric = OCRMetric(iou_thresh=args.iou, use_polygons=not args.eval_straight) + + det_metric = LocalizationConfusion(iou_thresh=args.iou, use_polygons=not args.eval_straight) + e2e_metric = OCRMetric(iou_thresh=args.iou, use_polygons=not args.eval_straight) sample_idx = 0 extraction_fn = extract_crops if args.eval_straight else extract_rcrops @@ -194,7 +187,6 @@ def parse_args(): parser.add_argument("--label_file", type=str, default=None, help="Only for local sets, path to labels") parser.add_argument("--rotation", dest="rotation", action="store_true", help="run rotated OCR + postprocessing") parser.add_argument("-b", "--batch_size", type=int, default=32, help="batch size for recognition") - parser.add_argument("--mask_shape", type=int, default=None, help="mask shape for mask iou (only for rotation)") parser.add_argument("--samples", type=int, default=None, help="evaluate only on the N first samples") parser.add_argument( "--eval-straight", diff --git a/tests/common/test_utils_metrics.py b/tests/common/test_utils_metrics.py index 1d83fca962..2c01682ab6 100644 --- a/tests/common/test_utils_metrics.py +++ b/tests/common/test_utils_metrics.py @@ -45,34 +45,6 @@ def test_box_iou(box1, box2, iou, abs_tol): assert abs(iou_mat - iou) <= abs_tol -@pytest.mark.parametrize( - "mask1, mask2, iou, abs_tol", - [ - [ - [[[True, True, False], [True, True, False]]], - [[[True, True, False], [True, True, False]]], - 1, - 0, - ], # Perfect match - [ - [[[True, False, False], [False, False, False]]], - [[[True, True, False], [True, True, False]]], - 0.25, - 0, - ], # Partial match - ], -) -def test_mask_iou(mask1, mask2, iou, abs_tol): - iou_mat = metrics.mask_iou(np.asarray(mask1), np.asarray(mask2)) - assert iou_mat.shape == (len(mask1), len(mask2)) - if iou_mat.size > 0: - assert abs(iou_mat - iou) <= abs_tol - - # Incompatible spatial shapes - with pytest.raises(AssertionError): - metrics.mask_iou(np.zeros((2, 3, 5), dtype=bool), np.ones((3, 2, 5), dtype=bool)) - - @pytest.mark.parametrize( "rbox1, rbox2, iou, abs_tol", [ @@ -101,35 +73,18 @@ def test_mask_iou(mask1, mask2, iou, abs_tol): ], ) def test_polygon_iou(rbox1, rbox2, iou, abs_tol): - mask_shape = (256, 256) - iou_mat = metrics.polygon_iou(np.asarray(rbox1), np.asarray(rbox2), mask_shape) + iou_mat = metrics.polygon_iou(np.asarray(rbox1), np.asarray(rbox2)) assert iou_mat.shape == (len(rbox1), len(rbox2)) if iou_mat.size > 0: assert abs(iou_mat - iou) <= abs_tol # Ensure broadcasting doesn't change the result - iou_matbis = metrics.polygon_iou(np.asarray(rbox1), np.asarray(rbox2), mask_shape, use_broadcasting=False) + iou_matbis = metrics.polygon_iou(np.asarray(rbox1), np.asarray(rbox2)) assert np.all((iou_mat - iou_matbis) <= 1e-7) # Incorrect boxes with pytest.raises(AssertionError): - metrics.polygon_iou(np.zeros((2, 5), dtype=float), np.ones((3, 4), dtype=float), mask_shape) - - -@pytest.mark.parametrize( - "box, shape, mask", - [ - [ - [[0, 0], [0.5, 0], [0.5, 0.5], [0, 0.5]], - (2, 2), - [[True, False], [False, False]], - ], - ], -) -def test_rbox_to_mask(box, shape, mask): - masks = metrics.rbox_to_mask(np.asarray(box)[None, ...], shape) - assert masks.shape == (1, *shape) - assert np.all(masks[0] == np.asarray(mask, dtype=bool)) + metrics.polygon_iou(np.zeros((2, 5), dtype=float), np.ones((3, 4), dtype=float)) @pytest.mark.parametrize( @@ -190,7 +145,7 @@ def test_localization_confusion(gts, preds, iou_thresh, recall, precision, mean_ ], ) def test_r_localization_confusion(gts, preds, iou_thresh, recall, precision, mean_iou): - metric = metrics.LocalizationConfusion(iou_thresh, use_polygons=True, mask_shape=(1000, 1000)) + metric = metrics.LocalizationConfusion(iou_thresh, use_polygons=True) for _gts, _preds in zip(gts, preds): metric.update(np.asarray(_gts), np.zeros((0, 5)) if _preds is None else np.asarray(_preds)) assert metric.summary()[:2] == (recall, precision) @@ -334,13 +289,3 @@ def test_nms(): ] to_keep = metrics.nms(np.asarray(boxes), thresh=0.2) assert to_keep == [0, 2] - - -def test_box_ioa(): - boxes = [ - [0.1, 0.1, 0.2, 0.2], - [0.15, 0.15, 0.2, 0.2], - ] - mat = metrics.box_ioa(np.array(boxes), np.array(boxes)) - assert mat[1, 0] == mat[0, 0] == mat[1, 1] == 1.0 - assert abs(mat[0, 1] - 0.25) <= 1e-7