diff --git a/tools/README.md b/tools/README.md
new file mode 100644
index 0000000..5dd593f
--- /dev/null
+++ b/tools/README.md
@@ -0,0 +1,64 @@
+We use ResNet and Swin Transformer as the backbones in our model.
+
+* `convert-torchvision-to-d2.py`
+
+Tool to convert torchvision pre-trained weights for D2.
+
+```
+wget https://download.pytorch.org/models/resnet101-63fe2227.pth
+python tools/convert-torchvision-to-d2.py resnet101-63fe2227.pth R-101.pkl
+```
+
+* `convert-pretrained-swin-model-to-d2.py`
+
+Tool to convert Swin Transformer pre-trained weights for D2.
+
+```
+pip install timm
+
+wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
+python tools/convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
+
+wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth
+python tools/convert-pretrained-swin-model-to-d2.py swin_small_patch4_window7_224.pth swin_small_patch4_window7_224.pkl
+
+wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth
+python tools/convert-pretrained-swin-model-to-d2.py swin_base_patch4_window12_384_22k.pth swin_base_patch4_window12_384_22k.pkl
+
+wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth
+python tools/convert-pretrained-swin-model-to-d2.py swin_large_patch4_window12_384_22k.pth swin_large_patch4_window12_384_22k.pkl
+```
+
+
+* `analyze_model.py`
+
+Tool to analyze model parameters and FLOPs.
+
+Usage for semantic segmentation (ADE20K only, use with caution!):
+
+```
+python tools/analyze_model.py --num-inputs 1 --tasks flop --use-fixed-input-size --config-file CONFIG_FILE
+```
+
+Note that, for semantic segmentation (ADE20K only), we use a dummy image of fixed size `cfg.INPUT.CROP.SIZE[0] x cfg.INPUT.CROP.SIZE[0]`.
+Please do not use `--use-fixed-input-size` when calculating FLOPs on other datasets like Cityscapes!
+
+Usage for panoptic and instance segmentation:
+
+```
+python tools/analyze_model.py --num-inputs 100 --tasks flop --config-file CONFIG_FILE
+```
+
+Note that, for panoptic and instance segmentation, we compute the average FLOPs over 100 real validation images.
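+
+After converting a backbone checkpoint with either of the scripts above, point your config at
+the generated `.pkl` file and switch the input format to RGB. A minimal sketch (the docstrings
+of the two converter scripts list the full set of keys, e.g. `PIXEL_MEAN`, `PIXEL_STD` and
+`RESNETS.DEPTH` for ResNet):
+
+```
+MODEL:
+  WEIGHTS: "/path/to/R-101.pkl"
+INPUT:
+  FORMAT: "RGB"
+```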
diff --git a/tools/analyze_model.py b/tools/analyze_model.py
new file mode 100644
index 0000000..a92227e
--- /dev/null
+++ b/tools/analyze_model.py
@@ -0,0 +1,177 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Modified by Bowen Cheng from https://github.com/facebookresearch/detectron2/blob/main/tools/analyze_model.py
+
+import logging
+import numpy as np
+from collections import Counter
+import tqdm
+from fvcore.nn import flop_count_table  # can also try flop_count_str
+
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.config import CfgNode, LazyConfig, get_cfg, instantiate
+from detectron2.data import build_detection_test_loader
+from detectron2.engine import default_argument_parser
+from detectron2.modeling import build_model
+from detectron2.projects.deeplab import add_deeplab_config
+from detectron2.utils.analysis import (
+    FlopCountAnalysis,
+    activation_count_operators,
+    parameter_count_table,
+)
+from detectron2.utils.logger import setup_logger
+
+# fmt: off
+import os
+import sys
+sys.path.insert(1, os.path.join(sys.path[0], '..'))
+# fmt: on
+
+from mask2former import add_maskformer2_config
+
+logger = logging.getLogger("detectron2")
+
+
+def setup(args):
+    if args.config_file.endswith(".yaml"):
+        cfg = get_cfg()
+        add_deeplab_config(cfg)
+        add_maskformer2_config(cfg)
+        cfg.merge_from_file(args.config_file)
+        cfg.DATALOADER.NUM_WORKERS = 0
+        cfg.merge_from_list(args.opts)
+        cfg.freeze()
+    else:
+        cfg = LazyConfig.load(args.config_file)
+        cfg = LazyConfig.apply_overrides(cfg, args.opts)
+    setup_logger(name="fvcore")
+    setup_logger()
+    return cfg
+
+
+def do_flop(cfg):
+    if isinstance(cfg, CfgNode):
+        data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+        model = build_model(cfg)
+        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    else:
+        data_loader = instantiate(cfg.dataloader.test)
+        model = instantiate(cfg.model)
+        model.to(cfg.train.device)
+        DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
+    model.eval()
+
+    counts = Counter()
+    total_flops = []
+    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
+        if args.use_fixed_input_size and isinstance(cfg, CfgNode):
+            import torch
+            crop_size = cfg.INPUT.CROP.SIZE[0]
+            data[0]["image"] = torch.zeros((3, crop_size, crop_size))
+        flops = FlopCountAnalysis(model, data)
+        if idx > 0:
+            flops.unsupported_ops_warnings(False).uncalled_modules_warnings(False)
+        counts += flops.by_operator()
+        total_flops.append(flops.total())
+
+    logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops))
+    logger.info(
+        "Average GFlops for each type of operators:\n"
+        + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()])
+    )
+    logger.info(
+        "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9)
+    )
+
+
+def do_activation(cfg):
+    if isinstance(cfg, CfgNode):
+        data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])
+        model = build_model(cfg)
+        DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
+    else:
+        data_loader = instantiate(cfg.dataloader.test)
+        model = instantiate(cfg.model)
+        model.to(cfg.train.device)
+        DetectionCheckpointer(model).load(cfg.train.init_checkpoint)
+    model.eval()
+
+    counts = Counter()
+    total_activations = []
+    for idx, data in zip(tqdm.trange(args.num_inputs), data_loader):  # noqa
+        count = activation_count_operators(model, data)
+        counts += count
+        total_activations.append(sum(count.values()))
+    logger.info(
+        "(Million) Activations for Each Type of Operators:\n"
+        + str([(k, v / idx) for k, v in counts.items()])
+    )
+    logger.info(
+        "Total (Million) Activations: {}±{}".format(
+            np.mean(total_activations), np.std(total_activations)
+        )
+    )
+
+
+def do_parameter(cfg):
+    if isinstance(cfg, CfgNode):
+        model = build_model(cfg)
+    else:
+        model = instantiate(cfg.model)
+    logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5))
+
+
+def do_structure(cfg):
+    if isinstance(cfg, CfgNode):
+        model = build_model(cfg)
+    else:
+        model = instantiate(cfg.model)
+    logger.info("Model Structure:\n" + str(model))
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser(
+        epilog="""
+Examples:
+To show parameters of a model:
+$ ./analyze_model.py --tasks parameter \\
+    --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml
+Flops and activations are data-dependent, therefore inputs and model weights
+are needed to count them:
+$ ./analyze_model.py --num-inputs 100 --tasks flop \\
+    --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\
+    MODEL.WEIGHTS /path/to/model.pkl
+"""
+    )
+    parser.add_argument(
+        "--tasks",
+        choices=["flop", "activation", "parameter", "structure"],
+        required=True,
+        nargs="+",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-inputs",
+        default=100,
+        type=int,
+        help="number of inputs used to compute statistics for flops/activations, "
+        "both are data dependent.",
+    )
+    parser.add_argument(
+        "--use-fixed-input-size",
+        action="store_true",
+        help="use fixed input size when calculating flops",
+    )
+    args = parser.parse_args()
+    assert not args.eval_only
+    assert args.num_gpus == 1
+
+    cfg = setup(args)
+
+    for task in args.tasks:
+        {
+            "flop": do_flop,
+            "activation": do_activation,
+            "parameter": do_parameter,
+            "structure": do_structure,
+        }[task](cfg)
diff --git a/tools/convert-pretrained-swin-model-to-d2.py b/tools/convert-pretrained-swin-model-to-d2.py
new file mode 100644
index 0000000..8fbaeab
--- /dev/null
+++ b/tools/convert-pretrained-swin-model-to-d2.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+import pickle as pkl
+import sys
+
+import torch
+
+"""
+Usage:
+  # download pretrained swin model:
+  wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth
+  # run the conversion
+  ./convert-pretrained-swin-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl
+  # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config:
+MODEL:
+  WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl"
+INPUT:
+  FORMAT: "RGB"
+"""
+
+if __name__ == "__main__":
+    input = sys.argv[1]
+
+    obj = torch.load(input, map_location="cpu")["model"]
+
+    res = {"model": obj, "__author__": "third_party", "matching_heuristics": True}
+
+    with open(sys.argv[2], "wb") as f:
+        pkl.dump(res, f)
diff --git a/tools/convert-torchvision-to-d2.py b/tools/convert-torchvision-to-d2.py
new file mode 100644
index 0000000..060390a
--- /dev/null
+++ b/tools/convert-torchvision-to-d2.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+
+import pickle as pkl
+import sys
+
+import torch
+
+"""
+Usage:
+  # download one of the ResNet{18,34,50,101,152} models from torchvision:
+  wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth
+  # run the conversion
+  ./convert-torchvision-to-d2.py r50.pth r50.pkl
+  # Then, use r50.pkl with the following changes in config:
+MODEL:
+  WEIGHTS: "/path/to/r50.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  RESNETS:
+    DEPTH: 50
+    STRIDE_IN_1X1: False
+INPUT:
+  FORMAT: "RGB"
+"""
+
+if __name__ == "__main__":
+    input = sys.argv[1]
+
+    obj = torch.load(input, map_location="cpu")
+
+    newmodel = {}
+    for k in list(obj.keys()):
+        old_k = k
+        if "layer" not in k:
+            k = "stem." + k
+        for t in [1, 2, 3, 4]:
+            k = k.replace("layer{}".format(t), "res{}".format(t + 1))
+        for t in [1, 2, 3]:
+            k = k.replace("bn{}".format(t), "conv{}.norm".format(t))
+        k = k.replace("downsample.0", "shortcut")
+        k = k.replace("downsample.1", "shortcut.norm")
+        print(old_k, "->", k)
+        newmodel[k] = obj.pop(old_k).detach().numpy()
+
+    res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True}
+
+    with open(sys.argv[2], "wb") as f:
+        pkl.dump(res, f)
+    if obj:
+        print("Unconverted keys:", obj.keys())
diff --git a/tools/evaluate_coco_boundary_ap.py b/tools/evaluate_coco_boundary_ap.py
new file mode 100644
index 0000000..1e96b5d
--- /dev/null
+++ b/tools/evaluate_coco_boundary_ap.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Modified by Bowen Cheng from: https://github.com/bowenc0221/boundary-iou-api/blob/master/tools/coco_instance_evaluation.py
+
+"""
+Evaluation for COCO val2017:
+python ./tools/evaluate_coco_boundary_ap.py \
+    --gt-json-file COCO_GT_JSON \
+    --dt-json-file COCO_DT_JSON
+"""
+import argparse
+import json
+
+from boundary_iou.coco_instance_api.coco import COCO
+from boundary_iou.coco_instance_api.cocoeval import COCOeval
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--gt-json-file", default="")
+    parser.add_argument("--dt-json-file", default="")
+    parser.add_argument("--iou-type", default="boundary")
+    parser.add_argument("--dilation-ratio", default="0.020", type=float)
+    args = parser.parse_args()
+    print(args)
+
+    annFile = args.gt_json_file
+    resFile = args.dt_json_file
+    dilation_ratio = args.dilation_ratio
+    if args.iou_type == "boundary":
+        get_boundary = True
+    else:
+        get_boundary = False
+    cocoGt = COCO(annFile, get_boundary=get_boundary, dilation_ratio=dilation_ratio)
+
+    # remove box predictions
+    resFile = json.load(open(resFile))
+    for c in resFile:
+        c.pop("bbox", None)
+
+    cocoDt = cocoGt.loadRes(resFile)
+    cocoEval = COCOeval(cocoGt, cocoDt, iouType=args.iou_type, dilation_ratio=dilation_ratio)
+    cocoEval.evaluate()
+    cocoEval.accumulate()
+    cocoEval.summarize()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/evaluate_pq_for_semantic_segmentation.py b/tools/evaluate_pq_for_semantic_segmentation.py
new file mode 100644
index 0000000..d22d735
--- /dev/null
+++ b/tools/evaluate_pq_for_semantic_segmentation.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
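+#
+# Usage sketch (the flags are those defined in default_argument_parser() below; the JSON file
+# is a list of per-image dicts with "file_name", "category_id" and RLE "segmentation" entries,
+# e.g. as written by Detectron2's semantic segmentation evaluator):
+#
+#   python ./tools/evaluate_pq_for_semantic_segmentation.py \
+#       --dataset-name ade20k_sem_seg_val --json-file /path/to/predictions.json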
+
+import argparse
+import json
+import os
+from collections import defaultdict
+from tqdm import tqdm
+
+import numpy as np
+import torch
+
+from detectron2.data import MetadataCatalog
+from detectron2.data.detection_utils import read_image
+from detectron2.utils.file_io import PathManager
+from pycocotools import mask as maskUtils
+
+from panopticapi.evaluation import PQStat
+
+
+def default_argument_parser():
+    """
+    Creates a parser with some common arguments used by analysis tools.
+    Returns:
+        argparse.ArgumentParser:
+    """
+    parser = argparse.ArgumentParser(description="Evaluate PQ metric for semantic segmentation.")
+    # NOTE: currently does not support Cityscapes, you need to convert
+    # Cityscapes prediction format to Detectron2 prediction format.
+    parser.add_argument(
+        "--dataset-name",
+        default="ade20k_sem_seg_val",
+        choices=["ade20k_sem_seg_val", "coco_2017_test_stuff_10k_sem_seg", "ade20k_full_sem_seg_val"],
+        help="dataset name you want to evaluate")
+    parser.add_argument("--json-file", default="", help="path to detection json file")
+
+    return parser
+
+
+# Modified from the official panoptic api: https://github.com/cocodataset/panopticapi/blob/master/panopticapi/evaluation.py
+def pq_compute_single_image(segm_gt, segm_dt, categories, ignore_label):
+    pq_stat = PQStat()
+    VOID = ignore_label
+    OFFSET = 256 * 256 * 256
+
+    pan_gt = segm_gt
+    pan_pred = segm_dt
+
+    gt_ann = {'segments_info': []}
+    labels, labels_cnt = np.unique(segm_gt, return_counts=True)
+    for cat_id, cnt in zip(labels, labels_cnt):
+        if cat_id == VOID:
+            continue
+        gt_ann['segments_info'].append(
+            {"id": cat_id, "category_id": cat_id, "area": cnt, "iscrowd": 0}
+        )
+
+    pred_ann = {'segments_info': []}
+    for cat_id in np.unique(segm_dt):
+        pred_ann['segments_info'].append({"id": cat_id, "category_id": cat_id})
+
+    gt_segms = {el['id']: el for el in gt_ann['segments_info']}
+    pred_segms = {el['id']: el for el in pred_ann['segments_info']}
+
+    # predicted segments area calculation + prediction sanity checks
+    pred_labels_set = set(el['id'] for el in pred_ann['segments_info'])
+    labels, labels_cnt = np.unique(pan_pred, return_counts=True)
+    for label, label_cnt in zip(labels, labels_cnt):
+        if label not in pred_segms:
+            if label == VOID:
+                continue
+            raise KeyError('Segment with ID {} is presented in PNG and not presented in JSON.'.format(label))
+        pred_segms[label]['area'] = label_cnt
+        pred_labels_set.remove(label)
+        if pred_segms[label]['category_id'] not in categories:
+            raise KeyError('Segment with ID {} has unknown category_id {}.'.format(label, pred_segms[label]['category_id']))
+    if len(pred_labels_set) != 0:
+        raise KeyError('The following segment IDs {} are presented in JSON and not presented in PNG.'.format(list(pred_labels_set)))
+
+    # confusion matrix calculation
+    pan_gt_pred = pan_gt.astype(np.uint64) * OFFSET + pan_pred.astype(np.uint64)
+    gt_pred_map = {}
+    labels, labels_cnt = np.unique(pan_gt_pred, return_counts=True)
+    for label, intersection in zip(labels, labels_cnt):
+        gt_id = label // OFFSET
+        pred_id = label % OFFSET
+        gt_pred_map[(gt_id, pred_id)] = intersection
+
+    # count all matched pairs
+    gt_matched = set()
+    pred_matched = set()
+    for label_tuple, intersection in gt_pred_map.items():
+        gt_label, pred_label = label_tuple
+        if gt_label not in gt_segms:
+            continue
+        if pred_label not in pred_segms:
+            continue
+        if gt_segms[gt_label]['iscrowd'] == 1:
+            continue
+        if gt_segms[gt_label]['category_id'] != pred_segms[pred_label]['category_id']:
+            continue
+
+        union = pred_segms[pred_label]['area'] + gt_segms[gt_label]['area'] - intersection - gt_pred_map.get((VOID, pred_label), 0)
+        iou = intersection / union
+        if iou > 0.5:
+            pq_stat[gt_segms[gt_label]['category_id']].tp += 1
+            pq_stat[gt_segms[gt_label]['category_id']].iou += iou
+            gt_matched.add(gt_label)
+            pred_matched.add(pred_label)
+
+    # count false negatives
+    crowd_labels_dict = {}
+    for gt_label, gt_info in gt_segms.items():
+        if gt_label in gt_matched:
+            continue
+        # crowd segments are ignored
+        if gt_info['iscrowd'] == 1:
+            crowd_labels_dict[gt_info['category_id']] = gt_label
+            continue
+        pq_stat[gt_info['category_id']].fn += 1
+
+    # count false positives
+    for pred_label, pred_info in pred_segms.items():
+        if pred_label in pred_matched:
+            continue
+        # intersection of the segment with VOID
+        intersection = gt_pred_map.get((VOID, pred_label), 0)
+        # plus intersection with corresponding CROWD region if it exists
+        if pred_info['category_id'] in crowd_labels_dict:
+            intersection += gt_pred_map.get((crowd_labels_dict[pred_info['category_id']], pred_label), 0)
+        # predicted segment is ignored if more than half of the segment correspond to VOID and CROWD regions
+        if intersection / pred_info['area'] > 0.5:
+            continue
+        pq_stat[pred_info['category_id']].fp += 1
+
+    return pq_stat
+
+
+def main():
+    parser = default_argument_parser()
+    args = parser.parse_args()
+
+    _root = os.getenv("DETECTRON2_DATASETS", "datasets")
+    json_file = args.json_file
+
+    with open(json_file) as f:
+        predictions = json.load(f)
+
+    imgToAnns = defaultdict(list)
+    for pred in predictions:
+        image_id = os.path.basename(pred["file_name"]).split(".")[0]
+        imgToAnns[image_id].append(
+            {"category_id": pred["category_id"], "segmentation": pred["segmentation"]}
+        )
+
+    image_ids = list(imgToAnns.keys())
+
+    meta = MetadataCatalog.get(args.dataset_name)
+    class_names = meta.stuff_classes
+    num_classes = len(meta.stuff_classes)
+    ignore_label = meta.ignore_label
+    conf_matrix = np.zeros((num_classes + 1, num_classes + 1), dtype=np.int64)
+
+    categories = {}
+    for i in range(num_classes):
+        categories[i] = {"id": i, "name": class_names[i], "isthing": 0}
+
+    pq_stat = PQStat()
+
+    for image_id in tqdm(image_ids):
+        if args.dataset_name == "ade20k_sem_seg_val":
+            gt_dir = os.path.join(_root, "ADEChallengeData2016", "annotations_detectron2", "validation")
+            segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64)
+        elif args.dataset_name == "coco_2017_test_stuff_10k_sem_seg":
+            gt_dir = os.path.join(_root, "coco", "coco_stuff_10k", "annotations_detectron2", "test")
+            segm_gt = read_image(os.path.join(gt_dir, image_id + ".png")).copy().astype(np.int64)
+        elif args.dataset_name == "ade20k_full_sem_seg_val":
+            gt_dir = os.path.join(_root, "ADE20K_2021_17_01", "annotations_detectron2", "validation")
+            segm_gt = read_image(os.path.join(gt_dir, image_id + ".tif")).copy().astype(np.int64)
+        else:
+            raise ValueError(f"Unsupported dataset {args.dataset_name}")
+
+        # get predictions
+        segm_dt = np.zeros_like(segm_gt)
+        anns = imgToAnns[image_id]
+        for ann in anns:
+            # map back category_id
+            if hasattr(meta, "stuff_dataset_id_to_contiguous_id"):
+                if ann["category_id"] in meta.stuff_dataset_id_to_contiguous_id:
+                    category_id = meta.stuff_dataset_id_to_contiguous_id[ann["category_id"]]
+            else:
+                category_id = ann["category_id"]
+            mask = maskUtils.decode(ann["segmentation"])
+            segm_dt[mask > 0] = category_id
+
+        # miou
+        gt = segm_gt.copy()
+        pred = segm_dt.copy()
+        gt[gt == ignore_label] = num_classes
+        conf_matrix += np.bincount(
+            (num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
+            minlength=conf_matrix.size,
+        ).reshape(conf_matrix.shape)
+
+        # pq
+        pq_stat_single = pq_compute_single_image(segm_gt, segm_dt, categories, meta.ignore_label)
+        pq_stat += pq_stat_single
+
+    metrics = [("All", None), ("Stuff", False)]
+    results = {}
+    for name, isthing in metrics:
+        results[name], per_class_results = pq_stat.pq_average(categories, isthing=isthing)
+        if name == 'All':
+            results['per_class'] = per_class_results
+    print("{:10s}| {:>5s} {:>5s} {:>5s} {:>5s}".format("", "PQ", "SQ", "RQ", "N"))
+    print("-" * (10 + 7 * 4))
+
+    for name, _isthing in metrics:
+        print("{:10s}| {:5.1f} {:5.1f} {:5.1f} {:5d}".format(
+            name,
+            100 * results[name]['pq'],
+            100 * results[name]['sq'],
+            100 * results[name]['rq'],
+            results[name]['n'])
+        )
+
+    # calculate miou
+    acc = np.full(num_classes, np.nan, dtype=np.float64)
+    iou = np.full(num_classes, np.nan, dtype=np.float64)
+    tp = conf_matrix.diagonal()[:-1].astype(np.float64)
+    pos_gt = np.sum(conf_matrix[:-1, :-1], axis=0).astype(np.float64)
+    pos_pred = np.sum(conf_matrix[:-1, :-1], axis=1).astype(np.float64)
+    acc_valid = pos_gt > 0
+    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
+    iou_valid = (pos_gt + pos_pred) > 0
+    union = pos_gt + pos_pred - tp
+    iou[acc_valid] = tp[acc_valid] / union[acc_valid]
+    miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
+
+    print("")
+    print(f"mIoU: {miou}")
+
+
+if __name__ == '__main__':
+    main()