(This change also adds PyCharm IDE configuration files under models/official/mask_rcnn/.idea/: mask_rcnn.iml, misc.xml, modules.xml, vcs.xml and workspace.xml. Their XML content is not reproduced here.)
diff --git a/models/official/mask_rcnn/coco_metric.py b/models/official/mask_rcnn/coco_metric.py
index ea00a3255..47645fbdc 100644
--- a/models/official/mask_rcnn/coco_metric.py
+++ b/models/official/mask_rcnn/coco_metric.py
@@ -134,7 +134,8 @@ def load_predictions(self,
for i, image_id in enumerate(detection_results['source_id']):
if include_mask:
- box_coorindates_in_image = detection_results['detection_boxes'][i]
+ # box_coorindates_in_image = detection_results['detection_boxes'][i]
+ box_coorindates_in_image = detection_results['selected_box_rois'][i]
segments = generate_segmentation_from_masks(
detection_results['detection_masks'][i],
box_coorindates_in_image,
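
The rationale for switching to 'selected_box_rois' here appears to be that the
mask predictions are aligned to the ROI boxes fed to the mask head, so the
pasting step must use those boxes rather than the refined 'detection_boxes'.
Below is a minimal, illustrative numpy/cv2 sketch of that pasting step; the
helper name paste_instance_mask and the toy values are assumptions for
illustration, not part of this patch.

    # Sketch: paste one fixed-size mask prediction into an image canvas using
    # the box the mask was predicted against ([x, y, w, h] in image pixels).
    import numpy as np
    import cv2

    def paste_instance_mask(mask, box_xywh, image_height, image_width, thresh=0.5):
      x, y, w, h = [int(round(v)) for v in box_xywh]
      w, h = max(w, 1), max(h, 1)
      resized = cv2.resize(mask.astype(np.float32), (w, h))  # dsize is (width, height)
      canvas = np.zeros((image_height, image_width), dtype=np.uint8)
      x0, y0 = max(x, 0), max(y, 0)
      x1, y1 = min(x + w, image_width), min(y + h, image_height)
      canvas[y0:y1, x0:x1] = (
          resized[(y0 - y):(y1 - y), (x0 - x):(x1 - x)] > thresh).astype(np.uint8)
      return canvas

    # Example: a 28x28 mask predicted for an ROI at [x, y, w, h] = [40, 30, 64, 48].
    mask_28 = np.ones((28, 28), dtype=np.float32)
    full_mask = paste_instance_mask(mask_28, [40, 30, 64, 48], 200, 200)
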
diff --git a/models/official/mask_rcnn/coco_metric_back.py b/models/official/mask_rcnn/coco_metric_back.py
new file mode 100644
index 000000000..ea00a3255
--- /dev/null
+++ b/models/official/mask_rcnn/coco_metric_back.py
@@ -0,0 +1,339 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""COCO-style evaluation metrics.
+
+Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
+
+COCO API: github.com/cocodataset/cocoapi/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import atexit
+import copy
+import tempfile
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import pycocotools.mask as maskUtils
+import tensorflow.compat.v1 as tf
+import cv2
+
+
+class MaskCOCO(COCO):
+ """COCO object for mask evaluation.
+ """
+
+ def reset(self, dataset):
+ """Reset the dataset and groundtruth data index in this object.
+
+ Args:
+      dataset: dict of groundtruth data. It should have a similar structure to
+        the COCO groundtruth JSON file. It must contain three keys: {'images',
+        'annotations', 'categories'}.
+ 'images': list of image information dictionary. Required keys: 'id',
+ 'width' and 'height'.
+ 'annotations': list of dict. Bounding boxes and segmentations related
+ information. Required keys: {'id', 'image_id', 'category_id', 'bbox',
+ 'iscrowd', 'area', 'segmentation'}.
+ 'categories': list of dict of the category information.
+ Required key: 'id'.
+ Refer to http://cocodataset.org/#format-data for more details.
+
+ Raises:
+ AttributeError: If the dataset is empty or not a dict.
+ """
+ assert dataset, 'Groundtruth should not be empty.'
+ assert isinstance(dataset,
+ dict), 'annotation file format {} not supported'.format(
+ type(dataset))
+ self.anns, self.cats, self.imgs = dict(), dict(), dict()
+ self.dataset = copy.deepcopy(dataset)
+ self.createIndex()
+
+ def loadRes(self, detection_results, include_mask, is_image_mask=False):
+ """Load result file and return a result api object.
+
+ Args:
+      detection_results: a dictionary containing prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+ Returns:
+ res: result MaskCOCO api object
+ """
+ res = MaskCOCO()
+ res.dataset['images'] = [img for img in self.dataset['images']]
+ print('Loading and preparing results...')
+ predictions = self.load_predictions(
+ detection_results,
+ include_mask=include_mask,
+ is_image_mask=is_image_mask)
+    assert isinstance(predictions, list), 'results is not an array of objects'
+ if predictions:
+ image_ids = [pred['image_id'] for pred in predictions]
+ assert set(image_ids) == (set(image_ids) & set(self.getImgIds())), \
+ 'Results do not correspond to current coco set'
+
+ if (predictions and 'bbox' in predictions[0] and predictions[0]['bbox']):
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ bb = pred['bbox']
+ x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
+ if 'segmentation' not in pred:
+ pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
+ pred['area'] = bb[2] * bb[3]
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+ elif 'segmentation' in predictions[0]:
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ # now only support compressed RLE format as segmentation results
+ pred['area'] = maskUtils.area(pred['segmentation'])
+ if 'bbox' not in pred:
+ pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+
+ res.dataset['annotations'] = predictions
+
+ res.createIndex()
+ return res
+
+ def load_predictions(self,
+ detection_results,
+ include_mask,
+ is_image_mask=False):
+ """Create prediction dictionary list from detection and mask results.
+
+ Args:
+      detection_results: a dictionary containing numpy arrays that correspond
+        to prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+    Returns:
+      a list of dictionaries containing the prediction results from the model
+      in numpy form.
+ """
+ predictions = []
+ num_detections = detection_results['detection_scores'].size
+ current_index = 0
+ for i, image_id in enumerate(detection_results['source_id']):
+
+ if include_mask:
+ box_coorindates_in_image = detection_results['detection_boxes'][i]
+ segments = generate_segmentation_from_masks(
+ detection_results['detection_masks'][i],
+ box_coorindates_in_image,
+ int(detection_results['image_info'][i][3]),
+ int(detection_results['image_info'][i][4]),
+ is_image_mask=is_image_mask)
+
+ # Convert the mask to uint8 and then to fortranarray for RLE encoder.
+ encoded_masks = [
+ maskUtils.encode(np.asfortranarray(instance_mask.astype(np.uint8)))
+ for instance_mask in segments
+ ]
+
+ for box_index in range(int(detection_results['num_detections'][i])):
+ if current_index % 1000000 == 0:
+ print('{}/{}'.format(current_index, num_detections))
+ current_index += 1
+
+ prediction = {
+ 'image_id': int(image_id),
+ 'bbox': detection_results['detection_boxes'][i][box_index].tolist(),
+ 'score': detection_results['detection_scores'][i][box_index],
+ 'category_id': int(
+ detection_results['detection_classes'][i][box_index]),
+ }
+
+ if include_mask:
+ prediction['segmentation'] = encoded_masks[box_index]
+
+ predictions.append(prediction)
+
+ return predictions
+
+
+def generate_segmentation_from_masks(masks,
+ detected_boxes,
+ image_height,
+ image_width,
+ is_image_mask=False):
+ """Generates segmentation result from instance masks.
+
+ Args:
+ masks: a numpy array of shape [N, mask_height, mask_width] representing the
+ instance masks w.r.t. the `detected_boxes`.
+ detected_boxes: a numpy array of shape [N, 4] representing the reference
+ bounding boxes.
+ image_height: an integer representing the height of the image.
+ image_width: an integer representing the width of the image.
+ is_image_mask: bool. True: input masks are whole-image masks. False: input
+ masks are bounding-box level masks.
+
+ Returns:
+ segms: a numpy array of shape [N, image_height, image_width] representing
+ the instance masks *pasted* on the image canvas.
+ """
+
+ def expand_boxes(boxes, scale):
+ """Expands an array of boxes by a given scale."""
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
+ # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
+ # whereas `boxes` here is in [x1, y1, w, h] form
+ w_half = boxes[:, 2] * .5
+ h_half = boxes[:, 3] * .5
+ x_c = boxes[:, 0] + w_half
+ y_c = boxes[:, 1] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = np.zeros(boxes.shape)
+ boxes_exp[:, 0] = x_c - w_half
+ boxes_exp[:, 2] = x_c + w_half
+ boxes_exp[:, 1] = y_c - h_half
+ boxes_exp[:, 3] = y_c + h_half
+
+ return boxes_exp
+
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
+ # To work around an issue with cv2.resize (it seems to automatically pad
+ # with repeated border values), we manually zero-pad the masks by 1 pixel
+ # prior to resizing back to the original image resolution. This prevents
+ # "top hat" artifacts. We therefore need to expand the reference boxes by an
+ # appropriate factor.
+ _, mask_height, mask_width = masks.shape
+ scale = max((mask_width + 2.0) / mask_width,
+ (mask_height + 2.0) / mask_height)
+
+ ref_boxes = expand_boxes(detected_boxes, scale)
+ ref_boxes = ref_boxes.astype(np.int32)
+ padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
+ segms = []
+ for mask_ind, mask in enumerate(masks):
+ im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
+ if is_image_mask:
+ # Process whole-image masks.
+ im_mask[:, :] = mask[:, :]
+ else:
+ # Process mask inside bounding boxes.
+ padded_mask[1:-1, 1:-1] = mask[:, :]
+
+ ref_box = ref_boxes[mask_ind, :]
+ w = ref_box[2] - ref_box[0] + 1
+ h = ref_box[3] - ref_box[1] + 1
+ w = np.maximum(w, 1)
+ h = np.maximum(h, 1)
+
+ mask = cv2.resize(padded_mask, (w, h))
+ mask = np.array(mask > 0.5, dtype=np.uint8)
+
+ x_0 = max(ref_box[0], 0)
+ x_1 = min(ref_box[2] + 1, image_width)
+ y_0 = max(ref_box[1], 0)
+ y_1 = min(ref_box[3] + 1, image_height)
+
+ im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]), (
+ x_0 - ref_box[0]):(x_1 - ref_box[0])]
+ segms.append(im_mask)
+
+ segms = np.array(segms)
+ assert masks.shape[0] == segms.shape[0]
+ return segms
+
+
+class EvaluationMetric(object):
+ """COCO evaluation metric class."""
+
+ def __init__(self, filename, include_mask):
+ """Constructs COCO evaluation class.
+
+ The class provides the interface to metrics_fn in TPUEstimator. The
+ _evaluate() loads a JSON file in COCO annotation format as the
+ groundtruths and runs COCO evaluation.
+
+ Args:
+ filename: Ground truth JSON file name. If filename is None, use
+ groundtruth data passed from the dataloader for evaluation.
+ include_mask: boolean to indicate whether or not to include mask eval.
+ """
+ if filename:
+ if filename.startswith('gs://'):
+ _, local_val_json = tempfile.mkstemp(suffix='.json')
+ tf.gfile.Remove(local_val_json)
+
+ tf.gfile.Copy(filename, local_val_json)
+ atexit.register(tf.gfile.Remove, local_val_json)
+ else:
+ local_val_json = filename
+ self.coco_gt = MaskCOCO(local_val_json)
+ self.filename = filename
+ self.metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+ 'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+ self._include_mask = include_mask
+ if self._include_mask:
+ mask_metric_names = ['mask_' + x for x in self.metric_names]
+ self.metric_names.extend(mask_metric_names)
+
+ self._reset()
+
+ def _reset(self):
+ """Reset COCO API object."""
+ if self.filename is None and not hasattr(self, 'coco_gt'):
+ self.coco_gt = MaskCOCO()
+
+ def predict_metric_fn(self,
+ predictions,
+ is_predict_image_mask=False,
+ groundtruth_data=None):
+ """Generates COCO metrics."""
+ image_ids = list(set(predictions['source_id']))
+ if groundtruth_data is not None:
+ self.coco_gt.reset(groundtruth_data)
+ coco_dt = self.coco_gt.loadRes(
+ predictions, self._include_mask, is_image_mask=is_predict_image_mask)
+ coco_eval = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
+ coco_eval.params.imgIds = image_ids
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+ coco_metrics = coco_eval.stats
+
+ if self._include_mask:
+ # Create another object for instance segmentation metric evaluation.
+ mcoco_eval = COCOeval(self.coco_gt, coco_dt, iouType='segm')
+ mcoco_eval.params.imgIds = image_ids
+ mcoco_eval.evaluate()
+ mcoco_eval.accumulate()
+ mcoco_eval.summarize()
+ mask_coco_metrics = mcoco_eval.stats
+
+ if self._include_mask:
+ metrics = np.hstack((coco_metrics, mask_coco_metrics))
+ else:
+ metrics = coco_metrics
+
+ # clean up after evaluation is done.
+ self._reset()
+ metrics = metrics.astype(np.float32)
+
+ metrics_dict = {}
+ for i, name in enumerate(self.metric_names):
+ metrics_dict[name] = metrics[i]
+ return metrics_dict
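
For reference, a hedged usage sketch of the EvaluationMetric class defined
above. The prediction keys and shapes follow load_predictions(); the
annotation filename, image id and array values are placeholders, so the
numbers below are dummies rather than a real evaluation.

    import numpy as np
    import coco_metric

    # Assumes a local COCO-format annotation file and that image id 42 exists in it.
    metric = coco_metric.EvaluationMetric(
        filename='instances_val2017.json', include_mask=False)

    predictions = {
        'source_id': np.array([42]),                   # one image
        'num_detections': np.array([1]),
        'detection_boxes': np.full((1, 1, 4), 10.0),   # [x, y, w, h] after eval processing
        'detection_scores': np.array([[0.9]]),
        'detection_classes': np.array([[1]]),
    }
    results = metric.predict_metric_fn(predictions)    # dict of AP/AR values keyed by name
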
diff --git a/models/official/mask_rcnn/coco_metric_new.py b/models/official/mask_rcnn/coco_metric_new.py
new file mode 100644
index 000000000..47645fbdc
--- /dev/null
+++ b/models/official/mask_rcnn/coco_metric_new.py
@@ -0,0 +1,340 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""COCO-style evaluation metrics.
+
+Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
+
+COCO API: github.com/cocodataset/cocoapi/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import atexit
+import copy
+import tempfile
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import pycocotools.mask as maskUtils
+import tensorflow.compat.v1 as tf
+import cv2
+
+
+class MaskCOCO(COCO):
+ """COCO object for mask evaluation.
+ """
+
+ def reset(self, dataset):
+ """Reset the dataset and groundtruth data index in this object.
+
+ Args:
+      dataset: dict of groundtruth data. It should have a similar structure to
+        the COCO groundtruth JSON file. It must contain three keys: {'images',
+        'annotations', 'categories'}.
+ 'images': list of image information dictionary. Required keys: 'id',
+ 'width' and 'height'.
+ 'annotations': list of dict. Bounding boxes and segmentations related
+ information. Required keys: {'id', 'image_id', 'category_id', 'bbox',
+ 'iscrowd', 'area', 'segmentation'}.
+ 'categories': list of dict of the category information.
+ Required key: 'id'.
+ Refer to http://cocodataset.org/#format-data for more details.
+
+ Raises:
+ AttributeError: If the dataset is empty or not a dict.
+ """
+ assert dataset, 'Groundtruth should not be empty.'
+ assert isinstance(dataset,
+ dict), 'annotation file format {} not supported'.format(
+ type(dataset))
+ self.anns, self.cats, self.imgs = dict(), dict(), dict()
+ self.dataset = copy.deepcopy(dataset)
+ self.createIndex()
+
+ def loadRes(self, detection_results, include_mask, is_image_mask=False):
+ """Load result file and return a result api object.
+
+ Args:
+      detection_results: a dictionary containing prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+ Returns:
+ res: result MaskCOCO api object
+ """
+ res = MaskCOCO()
+ res.dataset['images'] = [img for img in self.dataset['images']]
+ print('Loading and preparing results...')
+ predictions = self.load_predictions(
+ detection_results,
+ include_mask=include_mask,
+ is_image_mask=is_image_mask)
+    assert isinstance(predictions, list), 'results is not an array of objects'
+ if predictions:
+ image_ids = [pred['image_id'] for pred in predictions]
+ assert set(image_ids) == (set(image_ids) & set(self.getImgIds())), \
+ 'Results do not correspond to current coco set'
+
+ if (predictions and 'bbox' in predictions[0] and predictions[0]['bbox']):
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ bb = pred['bbox']
+ x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
+ if 'segmentation' not in pred:
+ pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
+ pred['area'] = bb[2] * bb[3]
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+ elif 'segmentation' in predictions[0]:
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ # now only support compressed RLE format as segmentation results
+ pred['area'] = maskUtils.area(pred['segmentation'])
+ if 'bbox' not in pred:
+ pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+
+ res.dataset['annotations'] = predictions
+
+ res.createIndex()
+ return res
+
+ def load_predictions(self,
+ detection_results,
+ include_mask,
+ is_image_mask=False):
+ """Create prediction dictionary list from detection and mask results.
+
+ Args:
+      detection_results: a dictionary containing numpy arrays that correspond
+        to prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+    Returns:
+      a list of dictionaries containing the prediction results from the model
+      in numpy form.
+ """
+ predictions = []
+ num_detections = detection_results['detection_scores'].size
+ current_index = 0
+ for i, image_id in enumerate(detection_results['source_id']):
+
+ if include_mask:
+ # box_coorindates_in_image = detection_results['detection_boxes'][i]
+ box_coorindates_in_image = detection_results['selected_box_rois'][i]
+ segments = generate_segmentation_from_masks(
+ detection_results['detection_masks'][i],
+ box_coorindates_in_image,
+ int(detection_results['image_info'][i][3]),
+ int(detection_results['image_info'][i][4]),
+ is_image_mask=is_image_mask)
+
+ # Convert the mask to uint8 and then to fortranarray for RLE encoder.
+ encoded_masks = [
+ maskUtils.encode(np.asfortranarray(instance_mask.astype(np.uint8)))
+ for instance_mask in segments
+ ]
+
+ for box_index in range(int(detection_results['num_detections'][i])):
+ if current_index % 1000000 == 0:
+ print('{}/{}'.format(current_index, num_detections))
+ current_index += 1
+
+ prediction = {
+ 'image_id': int(image_id),
+ 'bbox': detection_results['detection_boxes'][i][box_index].tolist(),
+ 'score': detection_results['detection_scores'][i][box_index],
+ 'category_id': int(
+ detection_results['detection_classes'][i][box_index]),
+ }
+
+ if include_mask:
+ prediction['segmentation'] = encoded_masks[box_index]
+
+ predictions.append(prediction)
+
+ return predictions
+
+
+def generate_segmentation_from_masks(masks,
+ detected_boxes,
+ image_height,
+ image_width,
+ is_image_mask=False):
+ """Generates segmentation result from instance masks.
+
+ Args:
+ masks: a numpy array of shape [N, mask_height, mask_width] representing the
+ instance masks w.r.t. the `detected_boxes`.
+ detected_boxes: a numpy array of shape [N, 4] representing the reference
+ bounding boxes.
+ image_height: an integer representing the height of the image.
+ image_width: an integer representing the width of the image.
+ is_image_mask: bool. True: input masks are whole-image masks. False: input
+ masks are bounding-box level masks.
+
+ Returns:
+ segms: a numpy array of shape [N, image_height, image_width] representing
+ the instance masks *pasted* on the image canvas.
+ """
+
+ def expand_boxes(boxes, scale):
+ """Expands an array of boxes by a given scale."""
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
+ # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
+ # whereas `boxes` here is in [x1, y1, w, h] form
+ w_half = boxes[:, 2] * .5
+ h_half = boxes[:, 3] * .5
+ x_c = boxes[:, 0] + w_half
+ y_c = boxes[:, 1] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = np.zeros(boxes.shape)
+ boxes_exp[:, 0] = x_c - w_half
+ boxes_exp[:, 2] = x_c + w_half
+ boxes_exp[:, 1] = y_c - h_half
+ boxes_exp[:, 3] = y_c + h_half
+
+ return boxes_exp
+
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
+ # To work around an issue with cv2.resize (it seems to automatically pad
+ # with repeated border values), we manually zero-pad the masks by 1 pixel
+ # prior to resizing back to the original image resolution. This prevents
+ # "top hat" artifacts. We therefore need to expand the reference boxes by an
+ # appropriate factor.
+ _, mask_height, mask_width = masks.shape
+ scale = max((mask_width + 2.0) / mask_width,
+ (mask_height + 2.0) / mask_height)
+
+ ref_boxes = expand_boxes(detected_boxes, scale)
+ ref_boxes = ref_boxes.astype(np.int32)
+ padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
+ segms = []
+ for mask_ind, mask in enumerate(masks):
+ im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
+ if is_image_mask:
+ # Process whole-image masks.
+ im_mask[:, :] = mask[:, :]
+ else:
+ # Process mask inside bounding boxes.
+ padded_mask[1:-1, 1:-1] = mask[:, :]
+
+ ref_box = ref_boxes[mask_ind, :]
+ w = ref_box[2] - ref_box[0] + 1
+ h = ref_box[3] - ref_box[1] + 1
+ w = np.maximum(w, 1)
+ h = np.maximum(h, 1)
+
+ mask = cv2.resize(padded_mask, (w, h))
+ mask = np.array(mask > 0.5, dtype=np.uint8)
+
+ x_0 = max(ref_box[0], 0)
+ x_1 = min(ref_box[2] + 1, image_width)
+ y_0 = max(ref_box[1], 0)
+ y_1 = min(ref_box[3] + 1, image_height)
+
+ im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]), (
+ x_0 - ref_box[0]):(x_1 - ref_box[0])]
+ segms.append(im_mask)
+
+ segms = np.array(segms)
+ assert masks.shape[0] == segms.shape[0]
+ return segms
+
+
+class EvaluationMetric(object):
+ """COCO evaluation metric class."""
+
+ def __init__(self, filename, include_mask):
+ """Constructs COCO evaluation class.
+
+ The class provides the interface to metrics_fn in TPUEstimator. The
+ _evaluate() loads a JSON file in COCO annotation format as the
+ groundtruths and runs COCO evaluation.
+
+ Args:
+ filename: Ground truth JSON file name. If filename is None, use
+ groundtruth data passed from the dataloader for evaluation.
+ include_mask: boolean to indicate whether or not to include mask eval.
+ """
+ if filename:
+ if filename.startswith('gs://'):
+ _, local_val_json = tempfile.mkstemp(suffix='.json')
+ tf.gfile.Remove(local_val_json)
+
+ tf.gfile.Copy(filename, local_val_json)
+ atexit.register(tf.gfile.Remove, local_val_json)
+ else:
+ local_val_json = filename
+ self.coco_gt = MaskCOCO(local_val_json)
+ self.filename = filename
+ self.metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+ 'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+ self._include_mask = include_mask
+ if self._include_mask:
+ mask_metric_names = ['mask_' + x for x in self.metric_names]
+ self.metric_names.extend(mask_metric_names)
+
+ self._reset()
+
+ def _reset(self):
+ """Reset COCO API object."""
+ if self.filename is None and not hasattr(self, 'coco_gt'):
+ self.coco_gt = MaskCOCO()
+
+ def predict_metric_fn(self,
+ predictions,
+ is_predict_image_mask=False,
+ groundtruth_data=None):
+ """Generates COCO metrics."""
+ image_ids = list(set(predictions['source_id']))
+ if groundtruth_data is not None:
+ self.coco_gt.reset(groundtruth_data)
+ coco_dt = self.coco_gt.loadRes(
+ predictions, self._include_mask, is_image_mask=is_predict_image_mask)
+ coco_eval = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
+ coco_eval.params.imgIds = image_ids
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+ coco_metrics = coco_eval.stats
+
+ if self._include_mask:
+ # Create another object for instance segmentation metric evaluation.
+ mcoco_eval = COCOeval(self.coco_gt, coco_dt, iouType='segm')
+ mcoco_eval.params.imgIds = image_ids
+ mcoco_eval.evaluate()
+ mcoco_eval.accumulate()
+ mcoco_eval.summarize()
+ mask_coco_metrics = mcoco_eval.stats
+
+ if self._include_mask:
+ metrics = np.hstack((coco_metrics, mask_coco_metrics))
+ else:
+ metrics = coco_metrics
+
+ # clean up after evaluation is done.
+ self._reset()
+ metrics = metrics.astype(np.float32)
+
+ metrics_dict = {}
+ for i, name in enumerate(self.metric_names):
+ metrics_dict[name] = metrics[i]
+ return metrics_dict
diff --git a/models/official/mask_rcnn/evaluation.py b/models/official/mask_rcnn/evaluation.py
index 09a971da3..608b41e5d 100644
--- a/models/official/mask_rcnn/evaluation.py
+++ b/models/official/mask_rcnn/evaluation.py
@@ -45,6 +45,21 @@ def process_prediction_for_eval(prediction):
new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
processed_box_coordinates[image_id, box_id, :] = new_box
prediction['detection_boxes'] = processed_box_coordinates
+
+  # Apply the same conversion and rescaling to the ROI boxes used for mask pasting.
+ box_coordinates_rois = prediction['selected_box_rois']
+ box_coordinates_rois_coordinates = np.zeros_like(box_coordinates_rois)
+
+ for image_id in range(box_coordinates_rois.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates_rois.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates_rois[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ box_coordinates_rois_coordinates[image_id, box_id, :] = new_box
+ prediction['selected_box_rois'] = box_coordinates_rois_coordinates
+
return prediction
@@ -208,6 +223,7 @@ def write_image_summary(predictions, summary_writer, current_step):
len(predictions['detection_boxes'][i]),
int(predictions['num_detections'][i]))
detection_boxes = predictions['detection_boxes'][i][:num_detections]
+ selected_box_rois = predictions['selected_box_rois'][i][:num_detections]
detection_scores = predictions['detection_scores'][i][:num_detections]
detection_classes = predictions['detection_classes'][i][:num_detections]
@@ -221,6 +237,10 @@ def write_image_summary(predictions, summary_writer, current_step):
detection_boxes = detection_boxes * np.array(
[image_width, image_height, image_width, image_height])
+ selected_box_rois = selected_box_rois / np.array([w, h, w, h])
+ selected_box_rois = selected_box_rois * np.array(
+ [image_width, image_height, image_width, image_height])
+
gt_boxes = None
if 'groundtruth_boxes' in predictions:
gt_boxes = predictions['groundtruth_boxes'][i]
@@ -231,7 +251,7 @@ def write_image_summary(predictions, summary_writer, current_step):
if include_mask:
instance_masks = predictions['detection_masks'][i][0:num_detections]
segmentations = coco_metric.generate_segmentation_from_masks(
- instance_masks, detection_boxes, image_height, image_width)
+ instance_masks, selected_box_rois, image_height, image_width)
# From [x, y, w, h] to [x1, y1, x2, y2] and
# process_prediction_for_eval() set the box to be [x, y] format, need to
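
For clarity, an equivalent vectorized form (sketch only) of the per-box loop
this hunk adds; rois_to_coco_xywh is an illustrative helper name, not part of
the patch.

    import numpy as np

    def rois_to_coco_xywh(rois, image_info):
      # rois: [batch, num_boxes, 4] as [y1, x1, y2, x2] in padded-image coordinates.
      # image_info: [batch, >=3], with the per-image rescaling factor at index 2.
      y1, x1, y2, x2 = np.split(rois, 4, axis=-1)
      xywh = np.concatenate([x1, y1, x2 - x1, y2 - y1], axis=-1)
      scale = image_info[:, 2].reshape(-1, 1, 1)
      return xywh * scale

    rois = np.array([[[10.0, 20.0, 50.0, 80.0]]])   # one box, [y1, x1, y2, x2]
    info = np.array([[1024.0, 1024.0, 2.0]])        # scale = 2.0
    print(rois_to_coco_xywh(rois, info))            # [[[40. 20. 120. 80.]]]
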
diff --git a/models/official/mask_rcnn/evaluation_back.py b/models/official/mask_rcnn/evaluation_back.py
new file mode 100644
index 000000000..09a971da3
--- /dev/null
+++ b/models/official/mask_rcnn/evaluation_back.py
@@ -0,0 +1,253 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to perform COCO evaluation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+from absl import logging
+import numpy as np
+from PIL import Image
+import six
+import tensorflow.compat.v1 as tf
+
+import coco_metric
+import coco_utils
+from object_detection import visualization_utils
+
+
+def process_prediction_for_eval(prediction):
+ """Process the model prediction for COCO eval."""
+ image_info = prediction['image_info']
+ box_coordinates = prediction['detection_boxes']
+ processed_box_coordinates = np.zeros_like(box_coordinates)
+
+ for image_id in range(box_coordinates.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ processed_box_coordinates[image_id, box_id, :] = new_box
+ prediction['detection_boxes'] = processed_box_coordinates
+ return prediction
+
+
+def compute_coco_eval_metric(predictor,
+ num_batches=-1,
+ include_mask=True,
+ annotation_json_file=None):
+ """Compute COCO eval metric given a prediction generator.
+
+ Args:
+ predictor: a generator that iteratively pops a dictionary of predictions
+ with the format compatible with COCO eval tool.
+ num_batches: the number of batches to be aggregated in eval. This is how
+ many times that the predictor gets pulled.
+ include_mask: a boolean that indicates whether we include the mask eval.
+ annotation_json_file: the annotation json file of the eval dataset.
+
+ Returns:
+ eval_results: the aggregated COCO metric eval results.
+ """
+ del num_batches
+
+ if not annotation_json_file:
+ annotation_json_file = None
+ use_groundtruth_from_json = (annotation_json_file is not None)
+
+ batch_idx = 0
+ predictions = dict()
+ while True:
+ try:
+ prediction = six.next(predictor)
+ logging.info('Running inference on batch %d...', (batch_idx + 1))
+ except StopIteration:
+ logging.info('Finished the eval set at %d batch.', (batch_idx + 1))
+ break
+
+ prediction = process_prediction_for_eval(prediction)
+ for k, v in six.iteritems(prediction):
+ if k not in predictions:
+ predictions[k] = [v]
+ else:
+ predictions[k].append(v)
+
+ batch_idx = batch_idx + 1
+
+ for k, v in six.iteritems(predictions):
+ predictions[k] = np.concatenate(predictions[k], axis=0)
+
+ if 'orig_images' in predictions and predictions['orig_images'].shape[0] > 10:
+ # Only samples a few images for visualization.
+ predictions['orig_images'] = predictions['orig_images'][:10]
+
+ if use_groundtruth_from_json:
+ eval_metric = coco_metric.EvaluationMetric(
+ annotation_json_file, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(predictions)
+ else:
+ images, annotations = coco_utils.extract_coco_groundtruth(
+ predictions, include_mask)
+ dataset = coco_utils.create_coco_format_dataset(images, annotations)
+ eval_metric = coco_metric.EvaluationMetric(
+ filename=None, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(
+ predictions, groundtruth_data=dataset)
+ logging.info('Eval results: %s', eval_results)
+ return eval_results, predictions
+
+
+def evaluate(eval_estimator,
+ input_fn,
+ num_eval_samples,
+ eval_batch_size,
+ include_mask=True,
+ validation_json_file=None):
+ """Runs COCO evaluation once."""
+ predictor = eval_estimator.predict(
+ input_fn=input_fn, yield_single_examples=False)
+ # Every predictor.next() gets a batch of prediction (a dictionary).
+ num_eval_times = num_eval_samples // eval_batch_size
+  assert num_eval_times > 0, 'num_eval_samples must be >= eval_batch_size!'
+ eval_results, predictions = compute_coco_eval_metric(predictor,
+ num_eval_times,
+ include_mask,
+ validation_json_file)
+ return eval_results, predictions
+
+
+def write_summary(eval_results, summary_writer, current_step, predictions=None):
+ """Write out eval results for the checkpoint."""
+ with tf.Graph().as_default():
+ summaries = []
+ for metric in eval_results:
+ summaries.append(
+ tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
+ write_image_summary(predictions, summary_writer, current_step)
+
+
+def create_image_summary(image,
+ boxes,
+ scores,
+ classes,
+ gt_boxes=None,
+ segmentations=None):
+ """Creates an image summary given predictions."""
+ max_boxes_to_draw = 100
+ min_score_thresh = 0.1
+
+  # Visualizes the predictions.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image,
+ boxes,
+ classes=classes,
+ scores=scores,
+ category_index={},
+ instance_masks=segmentations,
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ min_score_thresh=min_score_thresh,
+ agnostic_mode=False)
+ if gt_boxes is not None:
+ # Visualizes the groundtruth boxes. They are in black by default.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image_with_detections,
+ gt_boxes,
+ classes=None,
+ scores=None,
+ category_index={},
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ agnostic_mode=True)
+ buf = io.BytesIO()
+ w, h = image_with_detections.shape[:2]
+ ratio = 1024 / w
+ new_size = [int(w * ratio), int(h * ratio)]
+ image = Image.fromarray(image_with_detections.astype(np.uint8))
+ image.thumbnail(new_size)
+ image.save(buf, format='png')
+ image_summary = tf.Summary.Image(encoded_image_string=buf.getvalue())
+ return image_summary
+
+
+def write_image_summary(predictions, summary_writer, current_step):
+ """Write out image and prediction for summary."""
+ if not predictions or not isinstance(predictions, dict):
+ return
+ if 'orig_images' not in predictions:
+ logging.info('Missing orig_images in predictions: %s',
+ predictions.keys())
+ return
+ predictions['orig_images'] = predictions['orig_images'] * 255
+ predictions['orig_images'] = predictions['orig_images'].astype(np.uint8)
+ num_images = predictions['orig_images'].shape[0]
+ include_mask = ('detection_masks' in predictions)
+
+ with tf.Graph().as_default():
+ summaries = []
+ for i in xrange(num_images):
+ num_detections = min(
+ len(predictions['detection_boxes'][i]),
+ int(predictions['num_detections'][i]))
+ detection_boxes = predictions['detection_boxes'][i][:num_detections]
+ detection_scores = predictions['detection_scores'][i][:num_detections]
+ detection_classes = predictions['detection_classes'][i][:num_detections]
+
+ image = predictions['orig_images'][i]
+ image_height = image.shape[0]
+ image_width = image.shape[1]
+
+ # Rescale the box to fit the visualization image.
+ h, w = predictions['image_info'][i][3:5]
+ detection_boxes = detection_boxes / np.array([w, h, w, h])
+ detection_boxes = detection_boxes * np.array(
+ [image_width, image_height, image_width, image_height])
+
+ gt_boxes = None
+ if 'groundtruth_boxes' in predictions:
+ gt_boxes = predictions['groundtruth_boxes'][i]
+ gt_boxes = gt_boxes * np.array(
+ [image_height, image_width, image_height, image_width])
+
+ segmentations = None
+ if include_mask:
+ instance_masks = predictions['detection_masks'][i][0:num_detections]
+ segmentations = coco_metric.generate_segmentation_from_masks(
+ instance_masks, detection_boxes, image_height, image_width)
+
+      # Convert from [x, y, w, h] to [x1, y1, x2, y2]. Note that
+      # process_prediction_for_eval() put the boxes in [x, y] order, so they
+      # need to be reverted to [y, x] order for visualization.
+ xmin, ymin, w, h = np.split(detection_boxes, 4, axis=-1)
+ xmax = xmin + w
+ ymax = ymin + h
+ boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax], axis=-1)
+ image_summary = create_image_summary(
+ image,
+ boxes=boxes_to_visualize,
+ scores=detection_scores,
+ classes=detection_classes.astype(np.int32),
+ gt_boxes=gt_boxes,
+ segmentations=segmentations)
+ image_value = tf.Summary.Value(tag='%d_input' % i, image=image_summary)
+ summaries.append(image_value)
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
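
As a standalone sketch, the aggregation pattern used by
compute_coco_eval_metric() above: per-batch prediction dicts are collected
into lists per key and concatenated along the batch axis before a single COCO
evaluation pass. The batch contents here are dummies.

    import numpy as np

    batches = [
        {'source_id': np.array([1, 2]), 'detection_scores': np.zeros((2, 100))},
        {'source_id': np.array([3, 4]), 'detection_scores': np.zeros((2, 100))},
    ]
    aggregated = {}
    for batch in batches:
      for key, value in batch.items():
        aggregated.setdefault(key, []).append(value)
    aggregated = {k: np.concatenate(v, axis=0) for k, v in aggregated.items()}
    assert aggregated['source_id'].shape == (4,)
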
diff --git a/models/official/mask_rcnn/evaluation_new.py b/models/official/mask_rcnn/evaluation_new.py
new file mode 100644
index 000000000..608b41e5d
--- /dev/null
+++ b/models/official/mask_rcnn/evaluation_new.py
@@ -0,0 +1,273 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to perform COCO evaluation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+from absl import logging
+import numpy as np
+from PIL import Image
+import six
+import tensorflow.compat.v1 as tf
+
+import coco_metric
+import coco_utils
+from object_detection import visualization_utils
+
+
+def process_prediction_for_eval(prediction):
+ """Process the model prediction for COCO eval."""
+ image_info = prediction['image_info']
+ box_coordinates = prediction['detection_boxes']
+ processed_box_coordinates = np.zeros_like(box_coordinates)
+
+ for image_id in range(box_coordinates.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ processed_box_coordinates[image_id, box_id, :] = new_box
+ prediction['detection_boxes'] = processed_box_coordinates
+
+  # Apply the same conversion and rescaling to the ROI boxes used for mask pasting.
+ box_coordinates_rois = prediction['selected_box_rois']
+ box_coordinates_rois_coordinates = np.zeros_like(box_coordinates_rois)
+
+ for image_id in range(box_coordinates_rois.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates_rois.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates_rois[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ box_coordinates_rois_coordinates[image_id, box_id, :] = new_box
+ prediction['selected_box_rois'] = box_coordinates_rois_coordinates
+
+ return prediction
+
+
+def compute_coco_eval_metric(predictor,
+ num_batches=-1,
+ include_mask=True,
+ annotation_json_file=None):
+ """Compute COCO eval metric given a prediction generator.
+
+ Args:
+ predictor: a generator that iteratively pops a dictionary of predictions
+ with the format compatible with COCO eval tool.
+ num_batches: the number of batches to be aggregated in eval. This is how
+ many times that the predictor gets pulled.
+ include_mask: a boolean that indicates whether we include the mask eval.
+ annotation_json_file: the annotation json file of the eval dataset.
+
+ Returns:
+ eval_results: the aggregated COCO metric eval results.
+ """
+ del num_batches
+
+ if not annotation_json_file:
+ annotation_json_file = None
+ use_groundtruth_from_json = (annotation_json_file is not None)
+
+ batch_idx = 0
+ predictions = dict()
+ while True:
+ try:
+ prediction = six.next(predictor)
+ logging.info('Running inference on batch %d...', (batch_idx + 1))
+ except StopIteration:
+ logging.info('Finished the eval set at %d batch.', (batch_idx + 1))
+ break
+
+ prediction = process_prediction_for_eval(prediction)
+ for k, v in six.iteritems(prediction):
+ if k not in predictions:
+ predictions[k] = [v]
+ else:
+ predictions[k].append(v)
+
+ batch_idx = batch_idx + 1
+
+ for k, v in six.iteritems(predictions):
+ predictions[k] = np.concatenate(predictions[k], axis=0)
+
+ if 'orig_images' in predictions and predictions['orig_images'].shape[0] > 10:
+ # Only samples a few images for visualization.
+ predictions['orig_images'] = predictions['orig_images'][:10]
+
+ if use_groundtruth_from_json:
+ eval_metric = coco_metric.EvaluationMetric(
+ annotation_json_file, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(predictions)
+ else:
+ images, annotations = coco_utils.extract_coco_groundtruth(
+ predictions, include_mask)
+ dataset = coco_utils.create_coco_format_dataset(images, annotations)
+ eval_metric = coco_metric.EvaluationMetric(
+ filename=None, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(
+ predictions, groundtruth_data=dataset)
+ logging.info('Eval results: %s', eval_results)
+ return eval_results, predictions
+
+
+def evaluate(eval_estimator,
+ input_fn,
+ num_eval_samples,
+ eval_batch_size,
+ include_mask=True,
+ validation_json_file=None):
+ """Runs COCO evaluation once."""
+ predictor = eval_estimator.predict(
+ input_fn=input_fn, yield_single_examples=False)
+ # Every predictor.next() gets a batch of prediction (a dictionary).
+ num_eval_times = num_eval_samples // eval_batch_size
+  assert num_eval_times > 0, 'num_eval_samples must be >= eval_batch_size!'
+ eval_results, predictions = compute_coco_eval_metric(predictor,
+ num_eval_times,
+ include_mask,
+ validation_json_file)
+ return eval_results, predictions
+
+
+def write_summary(eval_results, summary_writer, current_step, predictions=None):
+ """Write out eval results for the checkpoint."""
+ with tf.Graph().as_default():
+ summaries = []
+ for metric in eval_results:
+ summaries.append(
+ tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
+ write_image_summary(predictions, summary_writer, current_step)
+
+
+def create_image_summary(image,
+ boxes,
+ scores,
+ classes,
+ gt_boxes=None,
+ segmentations=None):
+ """Creates an image summary given predictions."""
+ max_boxes_to_draw = 100
+ min_score_thresh = 0.1
+
+  # Visualizes the predictions.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image,
+ boxes,
+ classes=classes,
+ scores=scores,
+ category_index={},
+ instance_masks=segmentations,
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ min_score_thresh=min_score_thresh,
+ agnostic_mode=False)
+ if gt_boxes is not None:
+ # Visualizes the groundtruth boxes. They are in black by default.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image_with_detections,
+ gt_boxes,
+ classes=None,
+ scores=None,
+ category_index={},
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ agnostic_mode=True)
+ buf = io.BytesIO()
+ w, h = image_with_detections.shape[:2]
+ ratio = 1024 / w
+ new_size = [int(w * ratio), int(h * ratio)]
+ image = Image.fromarray(image_with_detections.astype(np.uint8))
+ image.thumbnail(new_size)
+ image.save(buf, format='png')
+ image_summary = tf.Summary.Image(encoded_image_string=buf.getvalue())
+ return image_summary
+
+
+def write_image_summary(predictions, summary_writer, current_step):
+ """Write out image and prediction for summary."""
+ if not predictions or not isinstance(predictions, dict):
+ return
+ if 'orig_images' not in predictions:
+ logging.info('Missing orig_images in predictions: %s',
+ predictions.keys())
+ return
+ predictions['orig_images'] = predictions['orig_images'] * 255
+ predictions['orig_images'] = predictions['orig_images'].astype(np.uint8)
+ num_images = predictions['orig_images'].shape[0]
+ include_mask = ('detection_masks' in predictions)
+
+ with tf.Graph().as_default():
+ summaries = []
+ for i in xrange(num_images):
+ num_detections = min(
+ len(predictions['detection_boxes'][i]),
+ int(predictions['num_detections'][i]))
+ detection_boxes = predictions['detection_boxes'][i][:num_detections]
+ selected_box_rois = predictions['selected_box_rois'][i][:num_detections]
+ detection_scores = predictions['detection_scores'][i][:num_detections]
+ detection_classes = predictions['detection_classes'][i][:num_detections]
+
+ image = predictions['orig_images'][i]
+ image_height = image.shape[0]
+ image_width = image.shape[1]
+
+ # Rescale the box to fit the visualization image.
+ h, w = predictions['image_info'][i][3:5]
+ detection_boxes = detection_boxes / np.array([w, h, w, h])
+ detection_boxes = detection_boxes * np.array(
+ [image_width, image_height, image_width, image_height])
+
+ selected_box_rois = selected_box_rois / np.array([w, h, w, h])
+ selected_box_rois = selected_box_rois * np.array(
+ [image_width, image_height, image_width, image_height])
+
+ gt_boxes = None
+ if 'groundtruth_boxes' in predictions:
+ gt_boxes = predictions['groundtruth_boxes'][i]
+ gt_boxes = gt_boxes * np.array(
+ [image_height, image_width, image_height, image_width])
+
+ segmentations = None
+ if include_mask:
+ instance_masks = predictions['detection_masks'][i][0:num_detections]
+ segmentations = coco_metric.generate_segmentation_from_masks(
+ instance_masks, selected_box_rois, image_height, image_width)
+
+      # Convert from [x, y, w, h] to [x1, y1, x2, y2]. Note that
+      # process_prediction_for_eval() put the boxes in [x, y] order, so they
+      # need to be reverted to [y, x] order for visualization.
+ xmin, ymin, w, h = np.split(detection_boxes, 4, axis=-1)
+ xmax = xmin + w
+ ymax = ymin + h
+ boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax], axis=-1)
+ image_summary = create_image_summary(
+ image,
+ boxes=boxes_to_visualize,
+ scores=detection_scores,
+ classes=detection_classes.astype(np.int32),
+ gt_boxes=gt_boxes,
+ segmentations=segmentations)
+ image_value = tf.Summary.Value(tag='%d_input' % i, image=image_summary)
+ summaries.append(image_value)
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
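
The box rescaling used for visualization in write_image_summary() above,
restated as a small standalone sketch: boxes are normalized by the processed
image size stored in image_info[i][3:5] (as [h, w]) and then scaled to the
original image size being drawn. The helper name and toy sizes are
illustrative.

    import numpy as np

    def rescale_for_visualization(boxes_xywh, processed_hw, orig_hw):
      h, w = processed_hw
      image_height, image_width = orig_hw
      boxes = boxes_xywh / np.array([w, h, w, h])
      return boxes * np.array([image_width, image_height, image_width, image_height])

    boxes = np.array([[10.0, 20.0, 30.0, 40.0]])                 # [x, y, w, h]
    vis_boxes = rescale_for_visualization(boxes, (800, 800), (480, 640))
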
diff --git a/models/official/mask_rcnn/losses.py b/models/official/mask_rcnn/losses.py
index e6815ad7c..99652a75b 100644
--- a/models/official/mask_rcnn/losses.py
+++ b/models/official/mask_rcnn/losses.py
@@ -133,7 +133,30 @@ def _fast_rcnn_box_loss(box_outputs, box_targets, class_targets, normalizer=1.0,
return box_loss
-def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets,
+def gIoU_loss(boxes1, boxes2, weight):
+  """Computes a generalized IoU (GIoU) loss; `weight` > 0 marks valid boxes."""
+  boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
+  boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
+  left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
+  right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
+
+  inter_section = tf.maximum(right_down - left_up, 0.0)
+  inter_area = inter_section[..., 0] * inter_section[..., 1]
+  union_area = boxes1_area + boxes2_area - inter_area
+  iou = inter_area / union_area
+
+  # GIoU subtracts the fraction of the smallest enclosing box that is not
+  # covered by the union of the two boxes.
+  enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
+  enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
+  enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
+  enclose_area = enclose[..., 0] * enclose[..., 1]
+  giou = iou - 1.0 * (enclose_area - union_area) / enclose_area
+  # Only positive (foreground) samples contribute to the loss.
+  mask = tf.cast(tf.greater(weight, 0), tf.float32)
+  mask = tf.stop_gradient(mask)
+
+  loss = (1 - giou) * mask
+  loss = tf.reduce_mean(loss)
+  return loss
+
+def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets, selected_class_targets,
params):
"""Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.
@@ -173,29 +196,10 @@ class and box losses from all levels.
class_loss = _fast_rcnn_class_loss(
class_outputs, class_targets_one_hot)
- # Selects the box from `box_outputs` based on `class_targets`, with which
- # the box has the maximum overlap.
- batch_size, num_rois, _ = box_outputs.get_shape().as_list()
- box_outputs = tf.reshape(box_outputs,
- [batch_size, num_rois, params['num_classes'], 4])
-
- box_indices = tf.reshape(
- class_targets + tf.tile(
- tf.expand_dims(
- tf.range(batch_size) * num_rois * params['num_classes'], 1),
- [1, num_rois]) + tf.tile(
- tf.expand_dims(tf.range(num_rois) * params['num_classes'], 0),
- [batch_size, 1]), [-1])
-
- box_outputs = tf.matmul(
- tf.one_hot(
- box_indices,
- batch_size * num_rois * params['num_classes'],
- dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
- box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
-
- box_loss = (params['fast_rcnn_box_loss_weight'] *
- _fast_rcnn_box_loss(box_outputs, box_targets, class_targets))
+
+    box_loss = (0.1 * params['fast_rcnn_box_loss_weight'] *
+                _fast_rcnn_box_loss(box_outputs, box_targets, selected_class_targets))
+    # box_loss = params['fast_rcnn_box_loss_weight'] * gIoU_loss(
+    #     box_outputs, box_targets, selected_class_targets)
+
total_loss = class_loss + box_loss
return total_loss, class_loss, box_loss
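
A quick numeric sanity check (plain numpy) of the GIoU math introduced in
gIoU_loss() above, using one illustrative pair of boxes in [x1, y1, x2, y2]
form. For A = [0, 0, 2, 2] and B = [1, 1, 3, 3]: IoU = 1/7, union U = 7,
enclosing-box area C = 9, so GIoU = IoU - (C - U)/C = 1/7 - 2/9 ~= -0.079 and
the per-box loss is 1 - GIoU ~= 1.079.

    import numpy as np

    a = np.array([0.0, 0.0, 2.0, 2.0])
    b = np.array([1.0, 1.0, 3.0, 3.0])
    inter = np.prod(np.maximum(np.minimum(a[2:], b[2:]) - np.maximum(a[:2], b[:2]), 0.0))
    union = np.prod(a[2:] - a[:2]) + np.prod(b[2:] - b[:2]) - inter
    enclose = np.prod(np.maximum(a[2:], b[2:]) - np.minimum(a[:2], b[:2]))
    giou = inter / union - (enclose - union) / enclose
    print(1.0 - giou)  # ~= 1.079
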
diff --git a/models/official/mask_rcnn/losses_back.py b/models/official/mask_rcnn/losses_back.py
new file mode 100644
index 000000000..e6815ad7c
--- /dev/null
+++ b/models/official/mask_rcnn/losses_back.py
@@ -0,0 +1,239 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Losses used for Mask-RCNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+
+def _rpn_score_loss(score_outputs, score_targets, normalizer=1.0):
+ """Computes score loss."""
+ # score_targets has three values: (1) score_targets[i]=1, the anchor is a
+ # positive sample. (2) score_targets[i]=0, negative. (3) score_targets[i]=-1,
+ # the anchor is don't care (ignore).
+ with tf.name_scope('rpn_score_loss'):
+ mask = tf.logical_or(tf.equal(score_targets, 1), tf.equal(score_targets, 0))
+ score_targets = tf.maximum(score_targets, tf.zeros_like(score_targets))
+ # RPN score loss is sum over all except ignored samples.
+ score_loss = tf.losses.sigmoid_cross_entropy(
+ score_targets, score_outputs, weights=mask,
+ reduction=tf.losses.Reduction.SUM)
+ score_loss /= normalizer
+ return score_loss
+
+
+def _rpn_box_loss(box_outputs, box_targets, normalizer=1.0, delta=1./9):
+ """Computes box regression loss."""
+  # delta is typically around the mean value of the regression target.
+  # For instance, the regression targets of a 512x512 input with 6 anchors on
+  # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('rpn_box_loss'):
+ mask = tf.not_equal(box_targets, 0.0)
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def rpn_loss(score_outputs, box_outputs, labels, params):
+ """Computes total RPN detection loss.
+
+ Computes total RPN detection loss including box and score from all levels.
+ Args:
+    score_outputs: an OrderedDict with keys representing levels and values
+      representing scores in [batch_size, height, width, num_anchors].
+    box_outputs: an OrderedDict with keys representing levels and values
+      representing box regression targets in
+      [batch_size, height, width, num_anchors * 4].
+    labels: the dictionary returned from the dataloader that includes
+      groundtruth targets.
+    params: the dictionary including training parameters specified in the
+      default_hparams function in this file.
+ Returns:
+ total_rpn_loss: a float tensor representing total loss reduced from
+ score and box losses from all levels.
+ rpn_score_loss: a float tensor representing total score loss.
+ rpn_box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('rpn_loss'):
+ levels = score_outputs.keys()
+
+ score_losses = []
+ box_losses = []
+ for level in levels:
+ score_targets_at_level = labels['score_targets_%d' % level]
+ box_targets_at_level = labels['box_targets_%d' % level]
+ score_losses.append(
+ _rpn_score_loss(
+ score_outputs[level],
+ score_targets_at_level,
+ normalizer=tf.to_float(
+ params['batch_size'] * params['rpn_batch_size_per_im'])))
+ box_losses.append(
+ _rpn_box_loss(box_outputs[level], box_targets_at_level))
+
+ # Sum per level losses to total loss.
+ rpn_score_loss = tf.add_n(score_losses)
+ rpn_box_loss = params['rpn_box_loss_weight'] * tf.add_n(box_losses)
+ total_rpn_loss = rpn_score_loss + rpn_box_loss
+ return total_rpn_loss, rpn_score_loss, rpn_box_loss
+
+
+def _fast_rcnn_class_loss(class_outputs, class_targets_one_hot, normalizer=1.0):
+ """Computes classification loss."""
+ with tf.name_scope('fast_rcnn_class_loss'):
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ class_loss = tf.losses.softmax_cross_entropy(
+ class_targets_one_hot, class_outputs,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ class_loss /= normalizer
+ return class_loss
+
+
+def _fast_rcnn_box_loss(box_outputs, box_targets, class_targets, normalizer=1.0,
+ delta=1.):
+ """Computes box regression loss."""
+  # delta is typically around the mean value of the regression target.
+  # For instance, the regression targets of a 512x512 input with 6 anchors on
+  # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('fast_rcnn_box_loss'):
+ mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
+ [1, 1, 4])
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets,
+ params):
+ """Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.
+
+ This function implements the classification and box regression loss of the
+ Fast-RCNN branch in Mask-RCNN. As the `box_outputs` produces `num_classes`
+ boxes for each RoI, the reference model expands `box_targets` to match the
+ shape of `box_outputs` and selects only the target that the RoI has a maximum
+ overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
+ Instead, this function selects the `box_outputs` by the `class_targets` so
+ that it doesn't expand `box_targets`.
+
+ The loss computation has two parts: (1) classification loss is softmax on all
+ RoIs. (2) box loss is smooth L1-loss on only positive samples of RoIs.
+ Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
+
+
+ Args:
+ class_outputs: a float tensor representing the class prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes].
+ box_outputs: a float tensor representing the box prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes * 4].
+ class_targets: a float tensor representing the class label for each box
+ with a shape of [batch_size, num_boxes].
+ box_targets: a float tensor representing the box label for each box
+ with a shape of [batch_size, num_boxes, 4].
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ total_loss: a float tensor representing the total loss reduced from
+ class and box losses from all levels.
+ cls_loss: a float tensor representing total class loss.
+ box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('fast_rcnn_loss'):
+ class_targets = tf.to_int32(class_targets)
+ class_targets_one_hot = tf.one_hot(class_targets, params['num_classes'])
+ class_loss = _fast_rcnn_class_loss(
+ class_outputs, class_targets_one_hot)
+
+ # Selects the box from `box_outputs` based on `class_targets`, with which
+ # the box has the maximum overlap.
+ batch_size, num_rois, _ = box_outputs.get_shape().as_list()
+ box_outputs = tf.reshape(box_outputs,
+ [batch_size, num_rois, params['num_classes'], 4])
+
+ box_indices = tf.reshape(
+ class_targets + tf.tile(
+ tf.expand_dims(
+ tf.range(batch_size) * num_rois * params['num_classes'], 1),
+ [1, num_rois]) + tf.tile(
+ tf.expand_dims(tf.range(num_rois) * params['num_classes'], 0),
+ [batch_size, 1]), [-1])
+
+ box_outputs = tf.matmul(
+ tf.one_hot(
+ box_indices,
+ batch_size * num_rois * params['num_classes'],
+ dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
+ box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
+
+ box_loss = (params['fast_rcnn_box_loss_weight'] *
+ _fast_rcnn_box_loss(box_outputs, box_targets, class_targets))
+ total_loss = class_loss + box_loss
+ return total_loss, class_loss, box_loss
+
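The class-indexed selection above flattens (batch, RoI, class) into one index and
uses a one-hot matmul, which avoids a gather op. A minimal standalone sketch of
the equivalent selection with `tf.gather_nd` (shapes and names here are
illustrative only):

import tensorflow.compat.v1 as tf

def select_box_by_class(box_outputs, class_targets, num_classes):
  # box_outputs: [B, R, num_classes * 4]; class_targets: int32 [B, R].
  batch_size, num_rois, _ = box_outputs.get_shape().as_list()
  boxes = tf.reshape(box_outputs, [batch_size, num_rois, num_classes, 4])
  batch_idx = tf.tile(
      tf.reshape(tf.range(batch_size), [batch_size, 1]), [1, num_rois])
  roi_idx = tf.tile(
      tf.reshape(tf.range(num_rois), [1, num_rois]), [batch_size, 1])
  indices = tf.stack([batch_idx, roi_idx, class_targets], axis=-1)
  # One [4] box per RoI, picked by its target class.
  return tf.gather_nd(boxes, indices)  # [B, R, 4]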
+
+def mask_rcnn_loss(mask_outputs, mask_targets, select_class_targets, params):
+ """Computes the mask loss of Mask-RCNN.
+
+ This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
+ produces `num_classes` masks for each RoI, the reference model expands
+ `mask_targets` to match the shape of `mask_outputs` and selects only the
+ target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
+ Instead, this implementation selects the `mask_outputs` by the `class_targets`
+ so that it doesn't expand `mask_targets`. Note that the selection logic is
+ done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
+
+ Args:
+ mask_outputs: a float tensor representing the prediction for each mask,
+ with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ mask_targets: a float tensor representing the binary mask of ground truth
+ labels for each mask with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ select_class_targets: a tensor with a shape of [batch_size, num_masks],
+ representing the foreground mask targets.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ mask_loss: a float tensor representing total mask loss.
+ """
+ with tf.name_scope('mask_rcnn_loss'):
+ (batch_size, num_masks, mask_height,
+ mask_width) = mask_outputs.get_shape().as_list()
+
+ weights = tf.tile(
+ tf.reshape(tf.greater(select_class_targets, 0),
+ [batch_size, num_masks, 1, 1]),
+ [1, 1, mask_height, mask_width])
+ loss = tf.losses.sigmoid_cross_entropy(
+ mask_targets, mask_outputs, weights=weights,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ return params['mrcnn_weight_loss_mask'] * loss
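The mask loss only counts pixels that belong to foreground RoIs: the per-RoI
class target is broadcast to a per-pixel weight map. A small sketch of that
weighting (values are illustrative only):

import tensorflow.compat.v1 as tf

# One image, two sampled 2x2 masks; the second RoI is background (class 0).
select_class_targets = tf.constant([[7, 0]])  # [batch=1, num_masks=2]
weights = tf.tile(
    tf.reshape(tf.greater(select_class_targets, 0), [1, 2, 1, 1]),
    [1, 1, 2, 2])
# weights[0, 0] is all True and weights[0, 1] is all False, so only the
# foreground mask contributes to the sigmoid cross entropy.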
diff --git a/models/official/mask_rcnn/losses_new.py b/models/official/mask_rcnn/losses_new.py
new file mode 100644
index 000000000..99652a75b
--- /dev/null
+++ b/models/official/mask_rcnn/losses_new.py
@@ -0,0 +1,243 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Losses used for Mask-RCNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+
+def _rpn_score_loss(score_outputs, score_targets, normalizer=1.0):
+ """Computes score loss."""
+ # score_targets has three values: (1) score_targets[i]=1, the anchor is a
+ # positive sample. (2) score_targets[i]=0, negative. (3) score_targets[i]=-1,
+ # the anchor is don't care (ignore).
+ with tf.name_scope('rpn_score_loss'):
+ mask = tf.logical_or(tf.equal(score_targets, 1), tf.equal(score_targets, 0))
+ score_targets = tf.maximum(score_targets, tf.zeros_like(score_targets))
+ # RPN score loss is sum over all except ignored samples.
+ score_loss = tf.losses.sigmoid_cross_entropy(
+ score_targets, score_outputs, weights=mask,
+ reduction=tf.losses.Reduction.SUM)
+ score_loss /= normalizer
+ return score_loss
+
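A small sketch of the {1, 0, -1} score-target convention (values are
illustrative only):

import tensorflow.compat.v1 as tf

score_targets = tf.constant([[1, 0, -1]])
mask = tf.logical_or(tf.equal(score_targets, 1), tf.equal(score_targets, 0))
# mask == [[True, True, False]]: the "don't care" anchor gets zero weight.
labels = tf.maximum(score_targets, tf.zeros_like(score_targets))
# labels == [[1, 0, 0]]: the -1 entry is clamped but already masked out.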
+
+def _rpn_box_loss(box_outputs, box_targets, normalizer=1.0, delta=1./9):
+ """Computes box regression loss."""
+ # delta is typically around the mean value of the regression target.
+ # for instance, the regression targets of a 512x512 input with 6 anchors on
+ # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('rpn_box_loss'):
+ mask = tf.not_equal(box_targets, 0.0)
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def rpn_loss(score_outputs, box_outputs, labels, params):
+ """Computes total RPN detection loss.
+
+ Computes total RPN detection loss including box and score from all levels.
+ Args:
+ score_outputs: an OrderedDict with keys representing levels and values
+ representing scores in [batch_size, height, width, num_anchors].
+ box_outputs: an OrderedDict with keys representing levels and values
+ representing box regression targets in
+ [batch_size, height, width, num_anchors * 4].
+ labels: the dictionary returned from the dataloader that includes
+ groundtruth targets.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ total_rpn_loss: a float tensor representing total loss reduced from
+ score and box losses from all levels.
+ rpn_score_loss: a float tensor representing total score loss.
+ rpn_box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('rpn_loss'):
+ levels = score_outputs.keys()
+
+ score_losses = []
+ box_losses = []
+ for level in levels:
+ score_targets_at_level = labels['score_targets_%d' % level]
+ box_targets_at_level = labels['box_targets_%d' % level]
+ score_losses.append(
+ _rpn_score_loss(
+ score_outputs[level],
+ score_targets_at_level,
+ normalizer=tf.to_float(
+ params['batch_size'] * params['rpn_batch_size_per_im'])))
+ box_losses.append(
+ _rpn_box_loss(box_outputs[level], box_targets_at_level))
+
+ # Sum per level losses to total loss.
+ rpn_score_loss = tf.add_n(score_losses)
+ rpn_box_loss = params['rpn_box_loss_weight'] * tf.add_n(box_losses)
+ total_rpn_loss = rpn_score_loss + rpn_box_loss
+ return total_rpn_loss, rpn_score_loss, rpn_box_loss
+
+
+def _fast_rcnn_class_loss(class_outputs, class_targets_one_hot, normalizer=1.0):
+ """Computes classification loss."""
+ with tf.name_scope('fast_rcnn_class_loss'):
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ class_loss = tf.losses.softmax_cross_entropy(
+ class_targets_one_hot, class_outputs,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ class_loss /= normalizer
+ return class_loss
+
+
+def _fast_rcnn_box_loss(box_outputs, box_targets, class_targets, normalizer=1.0,
+ delta=1.):
+ """Computes box regression loss."""
+ # delta is typically around the mean value of the regression target.
+ # for instance, the regression targets of a 512x512 input with 6 anchors on
+ # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('fast_rcnn_box_loss'):
+ mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
+ [1, 1, 4])
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def gIoU_loss(boxes1, boxes2, weight):
+ """Computes the generalized IoU (GIoU) loss between two sets of boxes.
+
+ Boxes are in corner form ([ymin, xmin, ymax, xmax] in this model). `weight`
+ marks foreground samples; only entries with weight > 0 contribute.
+ Reference: https://arxiv.org/abs/1902.09630
+ """
+ boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
+ boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
+ left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
+ right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
+
+ inter_section = tf.maximum(right_down - left_up, 0.0)
+ inter_area = inter_section[..., 0] * inter_section[..., 1]
+ union_area = boxes1_area + boxes2_area - inter_area
+ iou = inter_area / union_area
+
+ # Smallest axis-aligned box enclosing both boxes.
+ enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
+ enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
+ enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
+ enclose_area = enclose[..., 0] * enclose[..., 1]
+ giou = iou - 1.0 * (enclose_area - union_area) / enclose_area
+ # Only foreground samples contribute to the loss.
+ mask = tf.cast(tf.greater(weight, 0), tf.float32)
+ mask = tf.stop_gradient(mask)
+
+ loss = (1 - giou) * mask
+ loss = tf.reduce_mean(loss)
+ return loss
+
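A worked example of the GIoU term, assuming [ymin, xmin, ymax, xmax] boxes
(the numbers are illustrative only):

# boxes1 = [0, 0, 2, 2] and boxes2 = [1, 1, 3, 3], each with area 4.
# intersection = [1, 1, 2, 2] with area 1; union = 4 + 4 - 1 = 7; IoU = 1/7.
# smallest enclosing box = [0, 0, 3, 3] with area 9.
# GIoU = 1/7 - (9 - 7)/9 ~= 0.143 - 0.222 = -0.079, so (1 - GIoU) ~= 1.079
# for a foreground sample (weight > 0) and 0 for a background sample.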
+def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets, selected_class_targets,
+ params):
+ """Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.
+
+ This function implements the classification and box regression loss of the
+ Fast-RCNN branch in Mask-RCNN. As the `box_outputs` produces `num_classes`
+ boxes for each RoI, the reference model expands `box_targets` to match the
+ shape of `box_outputs` and selects only the target that the RoI has a maximum
+ overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
+ Instead, this function selects the `box_outputs` by the `class_targets` so
+ that it doesn't expand `box_targets`.
+
+ The loss computation has two parts: (1) classification loss is softmax on all
+ RoIs. (2) box loss is smooth L1-loss on only positive samples of RoIs.
+ Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
+
+
+ Args:
+ class_outputs: a float tensor representing the class prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes].
+ box_outputs: a float tensor representing the box prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes * 4].
+ class_targets: a float tensor representing the class label for each box
+ with a shape of [batch_size, num_boxes].
+ box_targets: a float tensor representing the box label for each box
+ with a shape of [batch_size, num_boxes, 4].
+ selected_class_targets: a tensor with a shape of [batch_size, num_boxes]
+ used to restrict the box regression loss to foreground samples.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ total_loss: a float tensor representing the total loss reduced from
+ class and box losses from all levels.
+ cls_loss: a float tensor representing total class loss.
+ box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('fast_rcnn_loss'):
+ class_targets = tf.to_int32(class_targets)
+ class_targets_one_hot = tf.one_hot(class_targets, params['num_classes'])
+ class_loss = _fast_rcnn_class_loss(
+ class_outputs, class_targets_one_hot)
+
+
+ box_loss = (0.1 * params['fast_rcnn_box_loss_weight'] *
+ _fast_rcnn_box_loss(box_outputs, box_targets, selected_class_targets))
+ # box_loss = params['fast_rcnn_box_loss_weight'] * gIoU_loss(box_outputs,box_targets,selected_class_targets)
+
+ total_loss = class_loss + box_loss
+ return total_loss, class_loss, box_loss
+
+
+def mask_rcnn_loss(mask_outputs, mask_targets, select_class_targets, params):
+ """Computes the mask loss of Mask-RCNN.
+
+ This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
+ produces `num_classes` masks for each RoI, the reference model expands
+ `mask_targets` to match the shape of `mask_outputs` and selects only the
+ target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
+ Instead, this implementation selects the `mask_outputs` by the `class_targets`
+ so that it doesn't expand `mask_targets`. Note that the selection logic is
+ done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
+
+ Args:
+ mask_outputs: a float tensor representing the prediction for each mask,
+ with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ mask_targets: a float tensor representing the binary mask of ground truth
+ labels for each mask with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ select_class_targets: a tensor with a shape of [batch_size, num_masks],
+ representing the foreground mask targets.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ mask_loss: a float tensor representing total mask loss.
+ """
+ with tf.name_scope('mask_rcnn_loss'):
+ (batch_size, num_masks, mask_height,
+ mask_width) = mask_outputs.get_shape().as_list()
+
+ weights = tf.tile(
+ tf.reshape(tf.greater(select_class_targets, 0),
+ [batch_size, num_masks, 1, 1]),
+ [1, 1, mask_height, mask_width])
+ loss = tf.losses.sigmoid_cross_entropy(
+ mask_targets, mask_outputs, weights=weights,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ return params['mrcnn_weight_loss_mask'] * loss
diff --git a/models/official/mask_rcnn/mask_rcnn_model.py b/models/official/mask_rcnn/mask_rcnn_model.py
index 3c221169e..f6517f4f9 100644
--- a/models/official/mask_rcnn/mask_rcnn_model.py
+++ b/models/official/mask_rcnn/mask_rcnn_model.py
@@ -44,642 +44,755 @@
import spatial_transform_ops
import training_ops
import sys
+
sys.path.append('tpu/models/official/mnasnet')
-import mnasnet_models
+
+
+# import mnasnet_models  # disabled; re-enable before using the 'mnasnet' backbone below
def create_optimizer(learning_rate, params):
- """Creates optimized based on the specified flags."""
- if params['optimizer'] == 'momentum':
- optimizer = tf.train.MomentumOptimizer(
- learning_rate, momentum=params['momentum'])
- elif params['optimizer'] == 'adam':
- optimizer = tf.train.AdamOptimizer(learning_rate)
- elif params['optimizer'] == 'adadelta':
- optimizer = tf.train.AdadeltaOptimizer(learning_rate)
- elif params['optimizer'] == 'adagrad':
- optimizer = tf.train.AdagradOptimizer(learning_rate)
- elif params['optimizer'] == 'rmsprop':
- optimizer = tf.train.RMSPropOptimizer(
- learning_rate, momentum=params['momentum'])
- elif params['optimizer'] == 'lars':
- try:
- from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
-
- optimizer = LARSOptimizer(
- learning_rate,
- momentum=params['momentum'],
- weight_decay=params['lars_weight_decay'],
- skip_list=['batch_normalization', 'bias'])
- except ImportError as e:
- logging.exception('LARSOptimizer is currently not supported '
- 'in TensorFlow 2.x.')
- raise e
-
- else:
- raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
- return optimizer
+ """Creates optimized based on the specified flags."""
+ if params['optimizer'] == 'momentum':
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'adam':
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ elif params['optimizer'] == 'adadelta':
+ optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+ elif params['optimizer'] == 'adagrad':
+ optimizer = tf.train.AdagradOptimizer(learning_rate)
+ elif params['optimizer'] == 'rmsprop':
+ optimizer = tf.train.RMSPropOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'lars':
+ try:
+ from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
+
+ optimizer = LARSOptimizer(
+ learning_rate,
+ momentum=params['momentum'],
+ weight_decay=params['lars_weight_decay'],
+ skip_list=['batch_normalization', 'bias'])
+ except ImportError as e:
+ logging.exception('LARSOptimizer is currently not supported '
+ 'in TensorFlow 2.x.')
+ raise e
+
+ else:
+ raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
+ return optimizer
def remove_variables(variables, prefix):
- """Removes low-level variables from the input.
+ """Removes low-level variables from the input.
- Removing low-level parameters (e.g., initial convolution layer) from training
- usually leads to higher training speed and slightly better testing accuracy.
- The intuition is that the low-level architecture (e.g., ResNet-50) is able to
- capture low-level features such as edges; therefore, it does not need to be
- fine-tuned for the detection task.
+ Removing low-level parameters (e.g., initial convolution layer) from training
+ usually leads to higher training speed and slightly better testing accuracy.
+ The intuition is that the low-level architecture (e.g., ResNet-50) is able to
+ capture low-level features such as edges; therefore, it does not need to be
+ fine-tuned for the detection task.
- Args:
- variables: all the variables in training
- prefix: prefix of backbone
+ Args:
+ variables: all the variables in training
+ prefix: prefix of backbone
- Returns:
- var_list: a list containing variables for training
+ Returns:
+ var_list: a list containing variables for training
- """
- # Freeze at conv2 based on reference model.
- # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
- remove_list = []
- remove_list.append(prefix + 'conv2d/')
- remove_list.append(prefix + 'batch_normalization/')
- for i in range(1, 11):
- remove_list.append(prefix + 'conv2d_{}/'.format(i))
- remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
+ """
+ # Freeze at conv2 based on reference model.
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
+ remove_list = []
+ remove_list.append(prefix + 'conv2d/')
+ remove_list.append(prefix + 'batch_normalization/')
+ for i in range(1, 11):
+ remove_list.append(prefix + 'conv2d_{}/'.format(i))
+ remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
- def _is_kept(variable):
- for rm_str in remove_list:
- if rm_str in variable.name:
- return False
- return True
+ def _is_kept(variable):
+ for rm_str in remove_list:
+ if rm_str in variable.name:
+ return False
+ return True
- var_list = [v for v in variables if _is_kept(v)]
- return var_list
+ var_list = [v for v in variables if _is_kept(v)]
+ var_list = [v for v in variables if _is_kept(v)]
+ return var_list
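A usage sketch (the scope prefix is hypothetical and depends on the backbone
variable scope):

# Variables such as 'resnet50/conv2d_3/kernel' or
# 'resnet50/batch_normalization_7/gamma' match the remove list and are
# excluded; everything else stays trainable.
var_list = remove_variables(tf.trainable_variables(), prefix='resnet50/')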
def compute_model_statistics(batch_size, is_training=True):
- """Compute number of parameters and FLOPS."""
- num_trainable_params = np.sum(
- [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
- logging.info('number of trainable params: %d', num_trainable_params)
-
- options = tf.profiler.ProfileOptionBuilder.float_operation()
- options['output'] = 'none'
- flops = tf.profiler.profile(
- tf.get_default_graph(), options=options).total_float_ops
- flops_per_image = flops / batch_size
- if is_training:
- logging.info(
- 'number of FLOPS per image: %f in training', flops_per_image)
- else:
- logging.info(
- 'number of FLOPS per image: %f in eval', flops_per_image)
+ """Compute number of parameters and FLOPS."""
+ num_trainable_params = np.sum(
+ [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
+ logging.info('number of trainable params: %d', num_trainable_params)
+
+ options = tf.profiler.ProfileOptionBuilder.float_operation()
+ options['output'] = 'none'
+ flops = tf.profiler.profile(
+ tf.get_default_graph(), options=options).total_float_ops
+ flops_per_image = flops / batch_size
+ if is_training:
+ logging.info(
+ 'number of FLOPS per image: %f in training', flops_per_image)
+ else:
+ logging.info(
+ 'number of FLOPS per image: %f in eval', flops_per_image)
+
+
+def build_box_outputs(mask, params):
+ with tf.tpu.bfloat16_scope():
+ with tf.variable_scope('bbox_head_by_mask'):
+
+ def build_transform_variable(l_r, min_cost, image_size):
+ # Builds an image_size x image_size matrix with 1 on the diagonal and a
+ # large negative value (min_cost) strictly below ("right") or strictly
+ # above ("left") it. Multiplying a binary row by this matrix and applying
+ # a ReLU keeps only the rightmost (resp. leftmost) active position.
+ variable = []
+ if l_r == "right":
+ l_v = min_cost
+ r_v = 0
+ elif l_r == "left":
+ l_v = 0
+ r_v = min_cost
+ else:
+ raise ValueError('l_r must be left or right')
+
+ for i in range(image_size):
+ row = []
+ for k in range(i):
+ row.append(l_v)
+ row.append(1)
+ for k in range(image_size - 1 - i):
+ row.append(r_v)
+ variable.append(row)
+
+ return variable
+
+ def cal_offset(scope, input, img_size, alpha=1e-4):
+ with tf.variable_scope(scope):
+ c_left = tf.constant(build_transform_variable("left", -img_size * 2., img_size), dtype=tf.float32)
+ c_right = tf.constant(build_transform_variable("right", -img_size * 2., img_size), dtype=tf.float32)
+ if params['precision'] == 'bfloat16':
+ c_left = tf.cast(c_left, tf.bfloat16)
+ c_right = tf.cast(c_right, tf.bfloat16)
+ with tf.variable_scope("mask"):
+ if params['precision'] == 'bfloat16':
+ net = tf.cast(tf.greater(input, alpha), tf.bfloat16)
+ else:
+ net = tf.cast(tf.greater(input, alpha), tf.float32)
+
+ # for left
+ net_left = tf.nn.relu(tf.matmul(net, c_left))
+ mask_left = tf.stop_gradient(net_left, name="mask_left")
+
+ # for right
+ net_right = tf.nn.relu(tf.matmul(net, c_right))
+ mask_right = tf.stop_gradient(net_right, name="mask")
+
+ with tf.variable_scope("work"):
+ offset_left = 1 - input
+ left = offset_left + tf.constant([float(i) for i in range(img_size)], dtype=tf.bfloat16)
+ left = tf.reduce_sum(left * mask_left, axis=-1)
+
+ right = input + tf.constant([float(i) for i in range(img_size)], dtype=tf.bfloat16)
+ right = tf.reduce_sum(right * mask_right, axis=-1)
+
+ return left, right, [net_right, input]
+
+ batch_size, num_boxes, img_size, _ = mask.get_shape().as_list()
+ mask = tf.reshape(mask, [-1, img_size, img_size])
+ # Dropout rescales the kept activations by 1/keep_prob, so clip back to [0, 1].
+ mask = tf.nn.dropout(mask, keep_prob=0.98)
+ mask = tf.clip_by_value(mask, 0, 1)
+ row = tf.reduce_max(mask, axis=2) # activation profile along height (y)
+ col = tf.reduce_max(mask, axis=1) # activation profile along width (x)
+ row_l, row_r, debug_row = cal_offset("cal_offset_row", row, img_size)
+ col_l, col_r, debug_col = cal_offset("cal_offset_col", col, img_size)
+ bbox = tf.stack([row_l, col_l, row_r, col_r], axis=-1)
+ bbox = tf.reshape(bbox, [batch_size, num_boxes, 4])
+
+ return bbox, img_size # y1,x1,y2,x2
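build_box_outputs derives a box directly from the soft mask: each row/column
activation profile is multiplied by a constant matrix and passed through a
ReLU, which isolates its outermost active position. A minimal NumPy sketch of
that mechanism for img_size=4 (values are illustrative only):

import numpy as np

row = np.array([0., 1., 1., 0.])           # thresholded activation profile
# "left" matrix: 1 on the diagonal, min_cost (-img_size * 2) above it.
c_left = np.array([[1, -8, -8, -8],
                   [0,  1, -8, -8],
                   [0,  0,  1, -8],
                   [0,  0,  0,  1]], dtype=np.float32)
# "right" matrix: 1 on the diagonal, min_cost below it.
c_right = c_left.T
left_mask = np.maximum(row @ c_left, 0.)   # [0., 1., 0., 0.] -> leftmost pixel
right_mask = np.maximum(row @ c_right, 0.) # [0., 0., 1., 0.] -> rightmost pixel
# The masked sums in cal_offset then turn these one-hot rows into sub-pixel
# boundaries, here roughly 1 and 3 for the active span [1, 2].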
def build_model_graph(features, labels, is_training, params):
- """Builds the forward model graph."""
- use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
- is_gpu_inference = (not is_training and use_batched_nms)
- model_outputs = {}
-
- if is_training and params['transpose_input']:
- if (params['backbone'].startswith('resnet') and
- params['conv0_space_to_depth_block_size'] > 0):
- features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ """Builds the forward model graph."""
+ use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
+ is_gpu_inference = (not is_training and use_batched_nms)
+ model_outputs = {}
+
+ if is_training and params['transpose_input']:
+ if (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0):
+ features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ else:
+ features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
+
+ batch_size, image_height, image_width, _ = (
+ features['images'].get_shape().as_list())
+
+ conv0_space_to_depth_block_size = 0
+ if (is_training and
+ (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0)):
+ conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
+ image_height *= conv0_space_to_depth_block_size
+ image_width *= conv0_space_to_depth_block_size
+
+ if 'source_ids' not in features:
+ features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
+
+ all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
+ params['num_scales'], params['aspect_ratios'],
+ params['anchor_scale'],
+ (image_height, image_width))
+
+ if 'resnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ resnet_fn = resnet.resnet_v1(
+ params['backbone'],
+ conv0_kernel_size=params['conv0_kernel_size'],
+ conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
+ num_batch_norm_group=params['num_batch_norm_group'])
+ backbone_feats = resnet_fn(
+ features['images'],
+ (params['is_training_bn'] and is_training))
+ elif 'mnasnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ _, endpoints = mnasnet_models.build_mnasnet_base(
+ features['images'],
+ params['backbone'],
+ training=(params['is_training_bn'] and is_training),
+ override_params={'use_keras': False})
+
+ backbone_feats = {
+ 2: endpoints['reduction_2'],
+ 3: endpoints['reduction_3'],
+ 4: endpoints['reduction_4'],
+ 5: endpoints['reduction_5'],
+ }
else:
- features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
-
- batch_size, image_height, image_width, _ = (
- features['images'].get_shape().as_list())
-
- conv0_space_to_depth_block_size = 0
- if (is_training and
- (params['backbone'].startswith('resnet') and
- params['conv0_space_to_depth_block_size'] > 0)):
- conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
- image_height *= conv0_space_to_depth_block_size
- image_width *= conv0_space_to_depth_block_size
-
- if 'source_ids' not in features:
- features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
-
- all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
- params['num_scales'], params['aspect_ratios'],
- params['anchor_scale'],
- (image_height, image_width))
-
- if 'resnet' in params['backbone']:
- with tf.variable_scope(params['backbone']):
- resnet_fn = resnet.resnet_v1(
- params['backbone'],
- conv0_kernel_size=params['conv0_kernel_size'],
- conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
- num_batch_norm_group=params['num_batch_norm_group'])
- backbone_feats = resnet_fn(
- features['images'],
- (params['is_training_bn'] and is_training))
- elif 'mnasnet' in params['backbone']:
- with tf.variable_scope(params['backbone']):
- _, endpoints = mnasnet_models.build_mnasnet_base(
- features['images'],
- params['backbone'],
- training=(params['is_training_bn'] and is_training),
- override_params={'use_keras': False})
-
- backbone_feats = {
- 2: endpoints['reduction_2'],
- 3: endpoints['reduction_3'],
- 4: endpoints['reduction_4'],
- 5: endpoints['reduction_5'],
- }
- else:
- raise ValueError('Not a valid backbone option: %s' % params['backbone'])
-
- fpn_feats = fpn.fpn(
- backbone_feats, params['min_level'], params['max_level'])
- model_outputs.update({
- 'fpn_features': fpn_feats,
- })
-
- rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
- fpn_feats,
- params['min_level'], params['max_level'],
- len(params['aspect_ratios'] * params['num_scales']))
-
- if is_training:
- rpn_pre_nms_topn = params['rpn_pre_nms_topn']
- rpn_post_nms_topn = params['rpn_post_nms_topn']
- else:
- rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
- rpn_post_nms_topn = params['test_rpn_post_nms_topn']
-
- rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
- rpn_score_outputs,
- rpn_box_outputs,
- all_anchors,
- features['image_info'],
- rpn_pre_nms_topn,
- rpn_post_nms_topn,
- params['rpn_nms_threshold'],
- params['rpn_min_size'],
- bbox_reg_weights=None,
- use_batched_nms=use_batched_nms)
- rpn_box_rois = tf.to_float(rpn_box_rois)
- if is_training:
- rpn_box_rois = tf.stop_gradient(rpn_box_rois)
- rpn_box_scores = tf.stop_gradient(rpn_box_scores)
-
- if is_training:
- # Sampling
- box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
- training_ops.proposal_label_op(
- rpn_box_rois,
- labels['gt_boxes'],
- labels['gt_classes'],
- batch_size_per_im=params['batch_size_per_im'],
- fg_fraction=params['fg_fraction'],
- fg_thresh=params['fg_thresh'],
- bg_thresh_hi=params['bg_thresh_hi'],
- bg_thresh_lo=params['bg_thresh_lo']))
-
- # Performs multi-level RoIAlign.
- box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
- fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
-
- class_outputs, box_outputs, _ = heads.box_head(
- box_roi_features, num_classes=params['num_classes'],
- mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
-
- if not is_training:
- if is_gpu_inference:
- generate_detections_fn = postprocess_ops.generate_detections_gpu
- else:
- generate_detections_fn = postprocess_ops.generate_detections_tpu
- detections = generate_detections_fn(
- class_outputs,
- box_outputs,
- rpn_box_rois,
- features['image_info'],
- params['test_rpn_post_nms_topn'],
- params['test_detections_per_image'],
- params['test_nms'],
- params['bbox_reg_weights'])
+ raise ValueError('Not a valid backbone option: %s' % params['backbone'])
+ fpn_feats = fpn.fpn(
+ backbone_feats, params['min_level'], params['max_level'])
model_outputs.update({
- 'num_detections': detections[0],
- 'detection_boxes': detections[1],
- 'detection_classes': detections[2],
- 'detection_scores': detections[3],
- })
- else:
- encoded_box_targets = training_ops.encode_box_targets(
- rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
- model_outputs.update({
- 'rpn_score_outputs': rpn_score_outputs,
- 'rpn_box_outputs': rpn_box_outputs,
- 'class_outputs': class_outputs,
- 'box_outputs': box_outputs,
- 'class_targets': class_targets,
- 'box_targets': encoded_box_targets,
- 'box_rois': rpn_box_rois,
+ 'fpn_features': fpn_feats,
})
- # Faster-RCNN mode.
- if not params['include_mask']:
+ rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
+ fpn_feats,
+ params['min_level'], params['max_level'],
+ len(params['aspect_ratios'] * params['num_scales']))
+
+ if is_training:
+ rpn_pre_nms_topn = params['rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['rpn_post_nms_topn']
+ else:
+ rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['test_rpn_post_nms_topn']
+
+ # rpn_box_rois: [ymin, xmin, ymax, xmax]
+ rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
+ rpn_score_outputs,
+ rpn_box_outputs,
+ all_anchors,
+ features['image_info'],
+ rpn_pre_nms_topn,
+ rpn_post_nms_topn,
+ params['rpn_nms_threshold'],
+ params['rpn_min_size'],
+ bbox_reg_weights=None,
+ use_batched_nms=use_batched_nms)
+ rpn_box_rois = tf.to_float(rpn_box_rois)
+ if is_training:
+ rpn_box_rois = tf.stop_gradient(rpn_box_rois)
+ rpn_box_scores = tf.stop_gradient(rpn_box_scores)
+
+ if is_training:
+ # Sampling
+ box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
+ training_ops.proposal_label_op(
+ rpn_box_rois,
+ labels['gt_boxes'],
+ labels['gt_classes'],
+ batch_size_per_im=params['batch_size_per_im'],
+ fg_fraction=params['fg_fraction'],
+ fg_thresh=params['fg_thresh'],
+ bg_thresh_hi=params['bg_thresh_hi'],
+ bg_thresh_lo=params['bg_thresh_lo']))
+
+ # Performs multi-level RoIAlign.
+ box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
+
+ class_outputs, _, _ = heads.box_head(
+ box_roi_features, num_classes=params['num_classes'],
+ mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
+
+ if not is_training:
+ if is_gpu_inference:
+ generate_detections_fn = postprocess_ops.generate_detections_gpu
+ else:
+ generate_detections_fn = postprocess_ops.generate_detections_tpu
+ detections = generate_detections_fn(
+ class_outputs,
+ rpn_box_rois,
+ features['image_info'],
+ params['test_rpn_post_nms_topn'],
+ params['test_detections_per_image'],
+ params['test_nms'],
+ params['bbox_reg_weights'])
+
+ model_outputs.update({
+ 'num_detections': detections[0],
+ 'detection_boxes': detections[1],
+ 'detection_classes': detections[2],
+ 'detection_scores': detections[3],
+ })
+ else:
+ # encoded_box_targets = training_ops.encode_box_targets(
+ # rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
+ model_outputs.update({
+ 'rpn_score_outputs': rpn_score_outputs,
+ 'rpn_box_outputs': rpn_box_outputs,
+ 'class_outputs': class_outputs,
+ 'class_targets': class_targets,
+ # 'box_outputs': box_outputs,
+ # 'box_targets': box_targets,
+ # 'box_rois': rpn_box_rois,
+ })
+
+ # # Faster-RCNN mode.
+ # if not params['include_mask']:
+ # # Print #parameters and #FLOPs in model.
+ # compute_model_statistics(batch_size, is_training=is_training)
+ #
+ # return model_outputs
+
+ def expand_boxes(boxes, scale):
+ # whereas `boxes` here is in [y1, x1, y2, x2] form
+ w_half = (boxes[..., 3] - boxes[..., 1]) * .5
+ h_half = (boxes[..., 2] - boxes[..., 0]) * .5
+ x_c = boxes[..., 1] + w_half
+ y_c = boxes[..., 0] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = tf.stack([y_c - h_half, x_c - w_half, y_c + h_half, x_c + w_half], axis=-1)
+
+ return boxes_exp
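expand_boxes grows a box about its center; a quick worked example with
scale 1.2 (numbers are illustrative only):

# boxes = [0., 0., 2., 2.] -> center (1, 1), half sizes 1.0 -> 1.2,
# so the expanded box is [-0.2, -0.2, 2.2, 2.2]. The clip_boxes call further
# down keeps such boxes inside the padded image.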
+
+ # Mask sampling
+ if not is_training:
+ selected_box_rois = model_outputs['detection_boxes']
+ class_indices = model_outputs['detection_classes']
+ # If using GPU for inference, delay the cast until the Gather ops show up,
+ # since GPU inference supports floating point better.
+ # TODO(laigd): revisit this when newer versions of GPU libraries is
+ # released.
+ selected_box_rois = expand_boxes(selected_box_rois, 1.2)
+ if not is_gpu_inference:
+ class_indices = tf.to_int32(class_indices)
+ else:
+ (selected_class_targets, selected_box_targets, selected_box_rois,
+ proposal_to_label_map) = (
+ training_ops.select_fg_for_masks(
+ class_targets, box_targets, rpn_box_rois,
+ proposal_to_label_map,
+ max_num_fg=int(
+ params['batch_size_per_im'] * params['fg_fraction'])))
+
+ selected_box_rois = expand_boxes(selected_box_targets, 1.2)
+ class_indices = tf.to_int32(selected_class_targets)
+
+
+ import box_utils
+ # Clip the expanded RoIs to the image boundary (the 1024 here assumes a
+ # 1024x1024 padded input).
+ selected_box_rois = box_utils.clip_boxes(selected_box_rois, 1024, 1024)
+ mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats,
+ selected_box_rois,
+ output_size=14,
+ is_gpu_inference=is_gpu_inference)
+ mask_outputs = heads.mask_head(
+ mask_roi_features,
+ class_indices,
+ num_classes=params['num_classes'],
+ mrcnn_resolution=params['mrcnn_resolution'],
+ is_gpu_inference=is_gpu_inference)
+
+ soft_mask_outputs = tf.nn.sigmoid(mask_outputs)
+ # Boxes are predicted in mask coordinates; map them back to image
+ # coordinates by scaling with the RoI size and offsetting by its origin.
+ box_outputs_in_mask, image_size = build_box_outputs(soft_mask_outputs, params)
+ offset = tf.stack(
+ [selected_box_rois[..., 0], selected_box_rois[..., 1], selected_box_rois[..., 0], selected_box_rois[..., 1]],
+ axis=-1)
+ h_scale = (selected_box_rois[..., 2] - selected_box_rois[..., 0]) / image_size
+ w_scale = (selected_box_rois[..., 3] - selected_box_rois[..., 1]) / image_size
+ wh_scale = tf.stack([h_scale, w_scale, h_scale, w_scale], axis=-1)
+ if params['precision'] == 'bfloat16':
+ wh_scale = tf.cast(wh_scale, tf.bfloat16)
+ offset = tf.cast(offset, tf.bfloat16)
+
+ box_outputs = offset + box_outputs_in_mask * wh_scale
+
# Print #parameters and #FLOPs in model.
compute_model_statistics(batch_size, is_training=is_training)
- return model_outputs
-
- # Mask sampling
- if not is_training:
- selected_box_rois = model_outputs['detection_boxes']
- class_indices = model_outputs['detection_classes']
- # If using GPU for inference, delay the cast until when Gather ops show up
- # since GPU inference supports float point better.
- # TODO(laigd): revisit this when newer versions of GPU libraries is
- # released.
- if not is_gpu_inference:
- class_indices = tf.to_int32(class_indices)
- else:
- (selected_class_targets, selected_box_targets, selected_box_rois,
- proposal_to_label_map) = (
- training_ops.select_fg_for_masks(
- class_targets, box_targets, rpn_box_rois,
- proposal_to_label_map,
- max_num_fg=int(
- params['batch_size_per_im'] * params['fg_fraction'])))
- class_indices = tf.to_int32(selected_class_targets)
-
- mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
- fpn_feats,
- selected_box_rois,
- output_size=14,
- is_gpu_inference=is_gpu_inference)
- mask_outputs = heads.mask_head(
- mask_roi_features,
- class_indices,
- num_classes=params['num_classes'],
- mrcnn_resolution=params['mrcnn_resolution'],
- is_gpu_inference=is_gpu_inference)
-
- # Print #parameters and #FLOPs in model.
- compute_model_statistics(batch_size, is_training=is_training)
-
- if is_training:
- mask_targets = training_ops.get_mask_targets(
- selected_box_rois, proposal_to_label_map, selected_box_targets,
- labels['cropped_gt_masks'], params['mrcnn_resolution'])
- model_outputs.update({
- 'mask_outputs': mask_outputs,
- 'mask_targets': mask_targets,
- 'selected_class_targets': selected_class_targets,
- })
- else:
- model_outputs.update({
- 'detection_masks': tf.nn.sigmoid(mask_outputs),
- })
+ if is_training:
+ mask_targets = training_ops.get_mask_targets(
+ selected_box_rois, proposal_to_label_map, selected_box_targets,
+ labels['cropped_gt_masks'], params['mrcnn_resolution'])
+ model_outputs.update({
+ 'mask_outputs': mask_outputs,
+ 'mask_targets': mask_targets,
+ 'selected_class_targets': selected_class_targets,
+ 'box_outputs': box_outputs,
+ 'box_targets': selected_box_targets,
+ })
+ else:
+ model_outputs.update({
+ 'detection_masks': soft_mask_outputs,
+ 'detection_boxes': box_outputs,
+ 'selected_box_rois': selected_box_rois
+ })
- return model_outputs
+ return model_outputs
def _build_assigment_map(optimizer, prefix=None, skip_variables_regex=None):
- """Generate assigment map for loading checkpoints."""
- optimizer_vars = set([var.name for var in optimizer.variables()])
- all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
- if not prefix:
- prefix = ''
- assignment_map = {}
- for var in all_vars:
- if var.name not in optimizer_vars:
- var_name = var.name
- # Trim the index of the variable.
- if ':' in var_name:
- var_name = var_name[:var_name.rindex(':')]
- if skip_variables_regex and re.match(skip_variables_regex,
- var_name[len(prefix):]):
- continue
- assignment_map[var_name[len(prefix):]] = var
- return assignment_map
+ """Generate assigment map for loading checkpoints."""
+ optimizer_vars = set([var.name for var in optimizer.variables()])
+ all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
+ if not prefix:
+ prefix = ''
+ assignment_map = {}
+ for var in all_vars:
+ if var.name not in optimizer_vars:
+ var_name = var.name
+ # Trim the index of the variable.
+ if ':' in var_name:
+ var_name = var_name[:var_name.rindex(':')]
+ if skip_variables_regex and re.match(skip_variables_regex,
+ var_name[len(prefix):]):
+ continue
+ assignment_map[var_name[len(prefix):]] = var
+ return assignment_map
def _model_fn(features, labels, mode, params, variable_filter_fn=None):
- """Model defination for the Mask-RCNN model based on ResNet.
-
- Args:
- features: the input image tensor and auxiliary information, such as
- `image_info` and `source_ids`. The image tensor has a shape of
- [batch_size, height, width, 3]. The height and width are fixed and equal.
- labels: the input labels in a dictionary. The labels include score targets
- and box targets which are dense label maps. The labels are generated from
- get_input_fn function in data/dataloader.py
- mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
- params: the dictionary defines hyperparameters of model. The default
- settings are in default_hparams function in this file.
- variable_filter_fn: the filter function that takes trainable_variables and
- returns the variable list after applying the filter rule.
-
- Returns:
- tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
- """
- if (mode == tf.estimator.ModeKeys.PREDICT or
- mode == tf.estimator.ModeKeys.EVAL):
- if ((params['include_groundtruth_in_features'] or
- mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
- # In include groundtruth for eval.
- labels = features['labels']
-
- if 'features' in features:
- features = features['features']
- # Otherwise, it is in export mode, the features is past in directly.
-
- if params['precision'] == 'bfloat16':
- with tf.tpu.bfloat16_scope():
- model_outputs = build_model_graph(features, labels,
- mode == tf.estimator.ModeKeys.TRAIN,
- params)
- model_outputs.update({
- 'source_id': features['source_ids'],
- 'image_info': features['image_info'],
- })
- def cast_outputs_to_float(d):
- for k, v in sorted(six.iteritems(d)):
- if isinstance(v, dict):
- cast_outputs_to_float(v)
- else:
- d[k] = tf.cast(v, tf.float32)
- cast_outputs_to_float(model_outputs)
- else:
- model_outputs = build_model_graph(features, labels,
- mode == tf.estimator.ModeKeys.TRAIN,
- params)
- model_outputs.update({
- 'source_id': features['source_ids'],
- 'image_info': features['image_info'],
- })
+ """Model defination for the Mask-RCNN model based on ResNet.
+
+ Args:
+ features: the input image tensor and auxiliary information, such as
+ `image_info` and `source_ids`. The image tensor has a shape of
+ [batch_size, height, width, 3]. The height and width are fixed and equal.
+ labels: the input labels in a dictionary. The labels include score targets
+ and box targets which are dense label maps. The labels are generated from
+ get_input_fn function in data/dataloader.py
+ mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+ params: the dictionary defines hyperparameters of model. The default
+ settings are in default_hparams function in this file.
+ variable_filter_fn: the filter function that takes trainable_variables and
+ returns the variable list after applying the filter rule.
+
+ Returns:
+ tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
+ """
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if ((params['include_groundtruth_in_features'] or
+ mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
+ # Include groundtruth for eval.
+ labels = features['labels']
+
+ if 'features' in features:
+ features = features['features']
+ # Otherwise, it is in export mode and the features are passed in directly.
+
+ if params['precision'] == 'bfloat16':
+ with tf.tpu.bfloat16_scope():
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ def cast_outputs_to_float(d):
+ for k, v in sorted(six.iteritems(d)):
+ if isinstance(v, dict):
+ cast_outputs_to_float(v)
+ else:
+ d[k] = tf.cast(v, tf.float32)
+
+ cast_outputs_to_float(model_outputs)
+ else:
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ # First check if it is in PREDICT or EVAL mode to fill out predictions.
+ # Predictions are used during the eval step to generate metrics.
+ predictions = {}
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if 'orig_images' in features:
+ model_outputs['orig_images'] = features['orig_images']
+ if labels and params['include_groundtruth_in_features']:
+ # Labels can only be embedded in predictions. The prediction cannot output
+ # a dictionary as a value.
+ predictions.update(labels)
+ model_outputs.pop('fpn_features', None)
+ predictions.update(model_outputs)
+ # If we are doing PREDICT, we can return here.
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
+ predictions=predictions)
+ return tf.estimator.EstimatorSpec(mode=mode,
+ predictions=predictions)
+
+ # Set up training loss and learning rate.
+ global_step = tf.train.get_or_create_global_step()
+ if params['learning_rate_type'] == 'step':
+ learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['learning_rate_levels'],
+ params['learning_rate_steps'])
+ elif params['learning_rate_type'] == 'cosine':
+ learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['total_steps'])
+ else:
+ raise ValueError('Unsupported learning rate type: `{}`!'
+ .format(params['learning_rate_type']))
+ # score_loss and box_loss are for logging. Only total_loss is optimized.
+ total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
+ model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
+ labels, params)
+
+ (total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss) = losses.fast_rcnn_loss(
+ model_outputs['class_outputs'], model_outputs['box_outputs'],
+ model_outputs['class_targets'], model_outputs['box_targets'],
+ model_outputs['selected_class_targets'], params)
+ # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
+ if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
+ mask_loss = losses.mask_rcnn_loss(
+ model_outputs['mask_outputs'], model_outputs['mask_targets'],
+ model_outputs['selected_class_targets'], params)
+ else:
+ mask_loss = 0.
+ if variable_filter_fn and ('resnet' in params['backbone']):
+ var_list = variable_filter_fn(tf.trainable_variables(),
+ params['backbone'] + '/')
+ else:
+ var_list = tf.trainable_variables()
+ l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in var_list
+ if 'batch_normalization' not in v.name and 'bias' not in v.name
+ ])
+ total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
+ l2_regularization_loss)
+
+ host_call = None
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = create_optimizer(learning_rate, params)
+ if params['use_tpu']:
+ optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+
+ scaffold_fn = None
+ if params['warm_start_path']:
+
+ def warm_start_scaffold_fn():
+ logging.info(
+ 'model_fn warm start from: %s,', params['warm_start_path'])
+ assignment_map = _build_assigment_map(
+ optimizer,
+ prefix=None,
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
+ return tf.train.Scaffold()
+
+ scaffold_fn = warm_start_scaffold_fn
+
+ elif params['checkpoint']:
+
+ def backbone_scaffold_fn():
+ """Loads pretrained model through scaffold function."""
+ # Exclude all variable of optimizer.
+ vars_to_load = _build_assigment_map(
+ optimizer,
+ prefix=params['backbone'] + '/',
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
+ if not vars_to_load:
+ raise ValueError('Variables to load is empty.')
+ return tf.train.Scaffold()
+
+ scaffold_fn = backbone_scaffold_fn
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
+ if params['global_gradient_clip_ratio'] > 0:
+ # Clips the gradients for training stability.
+ # Refer: https://arxiv.org/abs/1211.5063
+ with tf.name_scope('clipping'):
+ old_grads, variables = zip(*grads_and_vars)
+ num_weights = sum(
+ g.shape.num_elements() for g in old_grads if g is not None)
+ clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
+ num_weights)
+ logging.info(
+ 'Global clip norm set to %g for %d variables with %d elements.',
+ clip_norm, sum(1 for g in old_grads if g is not None),
+ num_weights)
+ gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
+ else:
+ gradients, variables = zip(*grads_and_vars)
+ grads_and_vars = []
+ # Special treatment for biases (beta is named as bias in reference model)
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
+ for grad, var in zip(gradients, variables):
+ if grad is not None and ('beta' in var.name or 'bias' in var.name):
+ grad = 2.0 * grad
+ grads_and_vars.append((grad, var))
+
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.apply_gradients(
+ grads_and_vars, global_step=global_step)
+
+ if params['use_host_call']:
+ def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
+ rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
+ learning_rate):
+ """Training host call. Creates scalar summaries for training metrics.
+
+ This function is executed on the CPU and should not directly reference
+ any Tensors in the rest of the `model_fn`. To pass Tensors from the
+ model to the `metric_fn`, provide as part of the `host_call`. See
+ https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
+ for more information.
+
+ Arguments should match the list of `Tensor` objects passed as the second
+ element in the tuple passed to `host_call`.
+
+ Args:
+ global_step: `Tensor` with shape `[batch, ]` for the global_step.
+ total_loss: `Tensor` with shape `[batch, ]` for the training loss.
+ total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ loss.
+ rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ score loss.
+ rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ box loss.
+ total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN loss.
+ fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN class loss.
+ fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN box loss.
+ mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
+ mask loss.
+ l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
+ regularization loss.
+ learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
+
+ Returns:
+ List of summary ops to run on the CPU host.
+ """
+ # Outfeed supports int32 but global_step is expected to be int64.
+ global_step = tf.reduce_mean(global_step)
+ # Host call fns are executed FLAGS.iterations_per_loop times after one
+ # TPU loop is finished, setting max_queue value to the same as number of
+ # iterations will make the summary writer only flush the data to storage
+ # once per loop.
+ with (tf2.summary.create_file_writer(
+ params['model_dir'],
+ max_queue=params['iterations_per_loop']).as_default()):
+ with tf2.summary.record_if(True):
+ tf2.summary.scalar(
+ 'total_loss', tf.reduce_mean(total_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
+ step=global_step)
+ if params['include_mask']:
+ tf2.summary.scalar(
+ 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
+ tf2.summary.scalar(
+ 'l2_regularization_loss',
+ tf.reduce_mean(l2_regularization_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'learning_rate', tf.reduce_mean(learning_rate),
+ step=global_step)
+
+ return tf.summary.all_v2_summary_ops()
+
+ # To log the loss, current learning rate, and epoch for Tensorboard, the
+ # summary op needs to be run on the host CPU via host_call. host_call
+ # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+ # dimension. These Tensors are implicitly concatenated to
+ # [params['batch_size']].
+ global_step_t = tf.reshape(global_step, [1])
+ total_loss_t = tf.reshape(total_loss, [1])
+ total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
+ rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
+ rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
+ total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
+ fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
+ fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
+ mask_loss_t = tf.reshape(mask_loss, [1])
+ l2_regularization_loss = tf.reshape(l2_regularization_loss, [1])
+ learning_rate_t = tf.reshape(learning_rate, [1])
+ host_call = (host_call_fn,
+ [global_step_t, total_loss_t, total_rpn_loss_t,
+ rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
+ fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
+ mask_loss_t, l2_regularization_loss, learning_rate_t])
+ else:
+ train_op = None
+ scaffold_fn = None
- # First check if it is in PREDICT or EVAL mode to fill out predictions.
- # Predictions are used during the eval step to generate metrics.
- predictions = {}
- if (mode == tf.estimator.ModeKeys.PREDICT or
- mode == tf.estimator.ModeKeys.EVAL):
- if 'orig_images' in features:
- model_outputs['orig_images'] = features['orig_images']
- if labels and params['include_groundtruth_in_features']:
- # Labels can only be embedded in predictions. The predition cannot output
- # dictionary as a value.
- predictions.update(labels)
- model_outputs.pop('fpn_features', None)
- predictions.update(model_outputs)
- # If we are doing PREDICT, we can return here.
- if mode == tf.estimator.ModeKeys.PREDICT:
- if params['use_tpu']:
- return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
- predictions=predictions)
- return tf.estimator.EstimatorSpec(mode=mode,
- predictions=predictions)
-
- # Set up training loss and learning rate.
- global_step = tf.train.get_or_create_global_step()
- if params['learning_rate_type'] == 'step':
- learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
- global_step,
- params['init_learning_rate'],
- params['warmup_learning_rate'],
- params['warmup_steps'],
- params['learning_rate_levels'],
- params['learning_rate_steps'])
- elif params['learning_rate_type'] == 'cosine':
- learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
- global_step,
- params['init_learning_rate'],
- params['warmup_learning_rate'],
- params['warmup_steps'],
- params['total_steps'])
- else:
- raise ValueError('Unsupported learning rate type: `{}`!'
- .format(params['learning_rate_type']))
- # score_loss and box_loss are for logging. only total_loss is optimized.
- total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
- model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
- labels, params)
-
- (total_fast_rcnn_loss, fast_rcnn_class_loss,
- fast_rcnn_box_loss) = losses.fast_rcnn_loss(
- model_outputs['class_outputs'], model_outputs['box_outputs'],
- model_outputs['class_targets'], model_outputs['box_targets'], params)
- # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
- if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
- mask_loss = losses.mask_rcnn_loss(
- model_outputs['mask_outputs'], model_outputs['mask_targets'],
- model_outputs['selected_class_targets'], params)
- else:
- mask_loss = 0.
- if variable_filter_fn and ('resnet' in params['backbone']):
- var_list = variable_filter_fn(tf.trainable_variables(),
- params['backbone'] + '/')
- else:
- var_list = tf.trainable_variables()
- l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
- tf.nn.l2_loss(v)
- for v in var_list
- if 'batch_normalization' not in v.name and 'bias' not in v.name
- ])
- total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
- l2_regularization_loss)
-
- host_call = None
- if mode == tf.estimator.ModeKeys.TRAIN:
- optimizer = create_optimizer(learning_rate, params)
if params['use_tpu']:
- optimizer = tf.tpu.CrossShardOptimizer(optimizer)
-
- scaffold_fn = None
- if params['warm_start_path']:
-
- def warm_start_scaffold_fn():
- logging.info(
- 'model_fn warm start from: %s,', params['warm_start_path'])
- assignment_map = _build_assigment_map(
- optimizer,
- prefix=None,
- skip_variables_regex=params['skip_checkpoint_variables'])
- tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
- return tf.train.Scaffold()
-
- scaffold_fn = warm_start_scaffold_fn
-
- elif params['checkpoint']:
-
- def backbone_scaffold_fn():
- """Loads pretrained model through scaffold function."""
- # Exclude all variable of optimizer.
- vars_to_load = _build_assigment_map(
- optimizer,
- prefix=params['backbone'] + '/',
- skip_variables_regex=params['skip_checkpoint_variables'])
- tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
- if not vars_to_load:
- raise ValueError('Variables to load is empty.')
- return tf.train.Scaffold()
-
- scaffold_fn = backbone_scaffold_fn
-
- # Batch norm requires update_ops to be added as a train_op dependency.
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
- if params['global_gradient_clip_ratio'] > 0:
- # Clips the gradients for training stability.
- # Refer: https://arxiv.org/abs/1211.5063
- with tf.name_scope('clipping'):
- old_grads, variables = zip(*grads_and_vars)
- num_weights = sum(
- g.shape.num_elements() for g in old_grads if g is not None)
- clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
- num_weights)
- logging.info(
- 'Global clip norm set to %g for %d variables with %d elements.',
- clip_norm, sum(1 for g in old_grads if g is not None),
- num_weights)
- gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
- else:
- gradients, variables = zip(*grads_and_vars)
- grads_and_vars = []
- # Special treatment for biases (beta is named as bias in reference model)
- # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
- for grad, var in zip(gradients, variables):
- if grad is not None and ('beta' in var.name or 'bias' in var.name):
- grad = 2.0 * grad
- grads_and_vars.append((grad, var))
-
- with tf.control_dependencies(update_ops):
- train_op = optimizer.apply_gradients(
- grads_and_vars, global_step=global_step)
-
- if params['use_host_call']:
- def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
- rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
- fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
- learning_rate):
- """Training host call. Creates scalar summaries for training metrics.
-
- This function is executed on the CPU and should not directly reference
- any Tensors in the rest of the `model_fn`. To pass Tensors from the
- model to the `metric_fn`, provide as part of the `host_call`. See
- https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
- for more information.
-
- Arguments should match the list of `Tensor` objects passed as the second
- element in the tuple passed to `host_call`.
-
- Args:
- global_step: `Tensor with shape `[batch, ]` for the global_step.
- total_loss: `Tensor` with shape `[batch, ]` for the training loss.
- total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
- loss.
- rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
- score loss.
- rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
- box loss.
- total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
- training Mask-RCNN loss.
- fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
- training Mask-RCNN class loss.
- fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
- training Mask-RCNN box loss.
- mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
- mask loss.
- l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
- regularization loss.
- learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
-
- Returns:
- List of summary ops to run on the CPU host.
- """
- # Outfeed supports int32 but global_step is expected to be int64.
- global_step = tf.reduce_mean(global_step)
- # Host call fns are executed FLAGS.iterations_per_loop times after one
- # TPU loop is finished, setting max_queue value to the same as number of
- # iterations will make the summary writer only flush the data to storage
- # once per loop.
- with (tf2.summary.create_file_writer(
- params['model_dir'],
- max_queue=params['iterations_per_loop']).as_default()):
- with tf2.summary.record_if(True):
- tf2.summary.scalar(
- 'total_loss', tf.reduce_mean(total_loss), step=global_step)
- tf2.summary.scalar(
- 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
- step=global_step)
- tf2.summary.scalar(
- 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
- step=global_step)
- tf2.summary.scalar(
- 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
- tf2.summary.scalar(
- 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
- step=global_step)
- tf2.summary.scalar(
- 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
- step=global_step)
- tf2.summary.scalar(
- 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
- step=global_step)
- if params['include_mask']:
- tf2.summary.scalar(
- 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
- tf2.summary.scalar(
- 'l2_regularization_loss',
- tf.reduce_mean(l2_regularization_loss),
- step=global_step)
- tf2.summary.scalar(
- 'learning_rate', tf.reduce_mean(learning_rate),
- step=global_step)
-
- return tf.summary.all_v2_summary_ops()
-
- # To log the loss, current learning rate, and epoch for Tensorboard, the
- # summary op needs to be run on the host CPU via host_call. host_call
- # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
- # dimension. These Tensors are implicitly concatenated to
- # [params['batch_size']].
- global_step_t = tf.reshape(global_step, [1])
- total_loss_t = tf.reshape(total_loss, [1])
- total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
- rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
- rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
- total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
- fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
- fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
- mask_loss_t = tf.reshape(mask_loss, [1])
- l2_regularization_loss = tf.reshape(l2_regularization_loss, [1])
- learning_rate_t = tf.reshape(learning_rate, [1])
- host_call = (host_call_fn,
- [global_step_t, total_loss_t, total_rpn_loss_t,
- rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
- fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
- mask_loss_t, l2_regularization_loss, learning_rate_t])
- else:
- train_op = None
- scaffold_fn = None
-
- if params['use_tpu']:
- return tf.estimator.tpu.TPUEstimatorSpec(
- mode=mode,
- loss=total_loss,
- train_op=train_op,
- host_call=host_call,
- scaffold_fn=scaffold_fn)
- return tf.estimator.EstimatorSpec(
- mode=mode, loss=total_loss, train_op=train_op)
+ return tf.estimator.tpu.TPUEstimatorSpec(
+ mode=mode,
+ loss=total_loss,
+ train_op=train_op,
+ host_call=host_call,
+ scaffold_fn=scaffold_fn)
+ return tf.estimator.EstimatorSpec(
+ mode=mode, loss=total_loss, train_op=train_op)
def mask_rcnn_model_fn(features, labels, mode, params):
- """Mask-RCNN model."""
- with tf.variable_scope('', reuse=tf.AUTO_REUSE):
- return _model_fn(
- features,
- labels,
- mode,
- params,
- variable_filter_fn=remove_variables)
+ """Mask-RCNN model."""
+ with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+ return _model_fn(
+ features,
+ labels,
+ mode,
+ params,
+ variable_filter_fn=remove_variables)
diff --git a/models/official/mask_rcnn/mask_rcnn_model_back.py b/models/official/mask_rcnn/mask_rcnn_model_back.py
new file mode 100644
index 000000000..3c221169e
--- /dev/null
+++ b/models/official/mask_rcnn/mask_rcnn_model_back.py
@@ -0,0 +1,685 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model defination for the Mask-RCNN Model.
+
+Defines model_fn of Mask-RCNN for TF Estimator. The model_fn includes Mask-RCNN
+model architecture, loss function, learning rate schedule, and evaluation
+procedure.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+import re
+from absl import logging
+import numpy as np
+import six
+import tensorflow.compat.v1 as tf
+import tensorflow.compat.v2 as tf2
+
+import anchors
+import fpn
+import heads
+import learning_rates
+import losses
+import postprocess_ops
+import resnet
+import roi_ops
+import spatial_transform_ops
+import training_ops
+import sys
+sys.path.append('tpu/models/official/mnasnet')
+import mnasnet_models
+
+
+def create_optimizer(learning_rate, params):
+ """Creates optimized based on the specified flags."""
+ if params['optimizer'] == 'momentum':
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'adam':
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ elif params['optimizer'] == 'adadelta':
+ optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+ elif params['optimizer'] == 'adagrad':
+ optimizer = tf.train.AdagradOptimizer(learning_rate)
+ elif params['optimizer'] == 'rmsprop':
+ optimizer = tf.train.RMSPropOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'lars':
+ try:
+ from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
+
+ optimizer = LARSOptimizer(
+ learning_rate,
+ momentum=params['momentum'],
+ weight_decay=params['lars_weight_decay'],
+ skip_list=['batch_normalization', 'bias'])
+ except ImportError as e:
+ logging.exception('LARSOptimizer is currently not supported '
+ 'in TensorFlow 2.x.')
+ raise e
+
+ else:
+ raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
+ return optimizer
+
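+# Minimal illustration of the dispatch above (values are illustrative, not
+# this model's defaults): create_optimizer(0.08, {'optimizer': 'momentum',
+# 'momentum': 0.9}) returns tf.train.MomentumOptimizer(0.08, momentum=0.9).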
+
+def remove_variables(variables, prefix):
+ """Removes low-level variables from the input.
+
+ Removing low-level parameters (e.g., initial convolution layer) from training
+ usually leads to higher training speed and slightly better testing accuracy.
+ The intuition is that the low-level architecture (e.g., ResNet-50) is able to
+ capture low-level features such as edges; therefore, it does not need to be
+ fine-tuned for the detection task.
+
+ Args:
+ variables: all the variables in training
+ prefix: prefix of backbone
+
+ Returns:
+ var_list: a list containing variables for training
+
+ """
+ # Freeze at conv2 based on reference model.
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
+ remove_list = []
+ remove_list.append(prefix + 'conv2d/')
+ remove_list.append(prefix + 'batch_normalization/')
+ for i in range(1, 11):
+ remove_list.append(prefix + 'conv2d_{}/'.format(i))
+ remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
+
+ def _is_kept(variable):
+ for rm_str in remove_list:
+ if rm_str in variable.name:
+ return False
+ return True
+
+ var_list = [v for v in variables if _is_kept(v)]
+ return var_list
+
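+# For example, with a 'resnet50' backbone the prefix is 'resnet50/', so
+# variables such as 'resnet50/conv2d/kernel:0' or
+# 'resnet50/batch_normalization_7/gamma:0' are dropped from the returned
+# list, while 'resnet50/conv2d_11/kernel:0' and deeper layers are kept.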
+
+def compute_model_statistics(batch_size, is_training=True):
+ """Compute number of parameters and FLOPS."""
+ num_trainable_params = np.sum(
+ [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
+ logging.info('number of trainable params: %d', num_trainable_params)
+
+ options = tf.profiler.ProfileOptionBuilder.float_operation()
+ options['output'] = 'none'
+ flops = tf.profiler.profile(
+ tf.get_default_graph(), options=options).total_float_ops
+ flops_per_image = flops / batch_size
+ if is_training:
+ logging.info(
+ 'number of FLOPS per image: %f in training', flops_per_image)
+ else:
+ logging.info(
+ 'number of FLOPS per image: %f in eval', flops_per_image)
+
+
+def build_model_graph(features, labels, is_training, params):
+ """Builds the forward model graph."""
+ use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
+ is_gpu_inference = (not is_training and use_batched_nms)
+ model_outputs = {}
+
+ if is_training and params['transpose_input']:
+ if (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0):
+ features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ else:
+ features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
+
+ batch_size, image_height, image_width, _ = (
+ features['images'].get_shape().as_list())
+
+ conv0_space_to_depth_block_size = 0
+ if (is_training and
+ (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0)):
+ conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
+ image_height *= conv0_space_to_depth_block_size
+ image_width *= conv0_space_to_depth_block_size
+
+ if 'source_ids' not in features:
+ features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
+
+ all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
+ params['num_scales'], params['aspect_ratios'],
+ params['anchor_scale'],
+ (image_height, image_width))
+
+ if 'resnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ resnet_fn = resnet.resnet_v1(
+ params['backbone'],
+ conv0_kernel_size=params['conv0_kernel_size'],
+ conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
+ num_batch_norm_group=params['num_batch_norm_group'])
+ backbone_feats = resnet_fn(
+ features['images'],
+ (params['is_training_bn'] and is_training))
+ elif 'mnasnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ _, endpoints = mnasnet_models.build_mnasnet_base(
+ features['images'],
+ params['backbone'],
+ training=(params['is_training_bn'] and is_training),
+ override_params={'use_keras': False})
+
+ backbone_feats = {
+ 2: endpoints['reduction_2'],
+ 3: endpoints['reduction_3'],
+ 4: endpoints['reduction_4'],
+ 5: endpoints['reduction_5'],
+ }
+ else:
+ raise ValueError('Not a valid backbone option: %s' % params['backbone'])
+
+ fpn_feats = fpn.fpn(
+ backbone_feats, params['min_level'], params['max_level'])
+ model_outputs.update({
+ 'fpn_features': fpn_feats,
+ })
+
+ rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
+ fpn_feats,
+ params['min_level'], params['max_level'],
+ len(params['aspect_ratios'] * params['num_scales']))
+
+ if is_training:
+ rpn_pre_nms_topn = params['rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['rpn_post_nms_topn']
+ else:
+ rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['test_rpn_post_nms_topn']
+
+ rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
+ rpn_score_outputs,
+ rpn_box_outputs,
+ all_anchors,
+ features['image_info'],
+ rpn_pre_nms_topn,
+ rpn_post_nms_topn,
+ params['rpn_nms_threshold'],
+ params['rpn_min_size'],
+ bbox_reg_weights=None,
+ use_batched_nms=use_batched_nms)
+ rpn_box_rois = tf.to_float(rpn_box_rois)
+ if is_training:
+ rpn_box_rois = tf.stop_gradient(rpn_box_rois)
+ rpn_box_scores = tf.stop_gradient(rpn_box_scores)
+
+ if is_training:
+ # Sampling
+ box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
+ training_ops.proposal_label_op(
+ rpn_box_rois,
+ labels['gt_boxes'],
+ labels['gt_classes'],
+ batch_size_per_im=params['batch_size_per_im'],
+ fg_fraction=params['fg_fraction'],
+ fg_thresh=params['fg_thresh'],
+ bg_thresh_hi=params['bg_thresh_hi'],
+ bg_thresh_lo=params['bg_thresh_lo']))
+
+ # Performs multi-level RoIAlign.
+ box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
+
+ class_outputs, box_outputs, _ = heads.box_head(
+ box_roi_features, num_classes=params['num_classes'],
+ mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
+
+ if not is_training:
+ if is_gpu_inference:
+ generate_detections_fn = postprocess_ops.generate_detections_gpu
+ else:
+ generate_detections_fn = postprocess_ops.generate_detections_tpu
+ detections = generate_detections_fn(
+ class_outputs,
+ box_outputs,
+ rpn_box_rois,
+ features['image_info'],
+ params['test_rpn_post_nms_topn'],
+ params['test_detections_per_image'],
+ params['test_nms'],
+ params['bbox_reg_weights'])
+
+ model_outputs.update({
+ 'num_detections': detections[0],
+ 'detection_boxes': detections[1],
+ 'detection_classes': detections[2],
+ 'detection_scores': detections[3],
+ })
+ else:
+ encoded_box_targets = training_ops.encode_box_targets(
+ rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
+ model_outputs.update({
+ 'rpn_score_outputs': rpn_score_outputs,
+ 'rpn_box_outputs': rpn_box_outputs,
+ 'class_outputs': class_outputs,
+ 'box_outputs': box_outputs,
+ 'class_targets': class_targets,
+ 'box_targets': encoded_box_targets,
+ 'box_rois': rpn_box_rois,
+ })
+
+ # Faster-RCNN mode.
+ if not params['include_mask']:
+ # Print #parameters and #FLOPs in model.
+ compute_model_statistics(batch_size, is_training=is_training)
+
+ return model_outputs
+
+ # Mask sampling
+ if not is_training:
+ selected_box_rois = model_outputs['detection_boxes']
+ class_indices = model_outputs['detection_classes']
+ # If using GPU for inference, delay the cast until the Gather ops show up,
+ # since GPU inference handles floating point better.
+ # TODO(laigd): revisit this when newer versions of the GPU libraries are
+ # released.
+ if not is_gpu_inference:
+ class_indices = tf.to_int32(class_indices)
+ else:
+ (selected_class_targets, selected_box_targets, selected_box_rois,
+ proposal_to_label_map) = (
+ training_ops.select_fg_for_masks(
+ class_targets, box_targets, rpn_box_rois,
+ proposal_to_label_map,
+ max_num_fg=int(
+ params['batch_size_per_im'] * params['fg_fraction'])))
+ class_indices = tf.to_int32(selected_class_targets)
+
+ mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats,
+ selected_box_rois,
+ output_size=14,
+ is_gpu_inference=is_gpu_inference)
+ mask_outputs = heads.mask_head(
+ mask_roi_features,
+ class_indices,
+ num_classes=params['num_classes'],
+ mrcnn_resolution=params['mrcnn_resolution'],
+ is_gpu_inference=is_gpu_inference)
+
+ # Print #parameters and #FLOPs in model.
+ compute_model_statistics(batch_size, is_training=is_training)
+
+ if is_training:
+ mask_targets = training_ops.get_mask_targets(
+ selected_box_rois, proposal_to_label_map, selected_box_targets,
+ labels['cropped_gt_masks'], params['mrcnn_resolution'])
+ model_outputs.update({
+ 'mask_outputs': mask_outputs,
+ 'mask_targets': mask_targets,
+ 'selected_class_targets': selected_class_targets,
+ })
+ else:
+ model_outputs.update({
+ 'detection_masks': tf.nn.sigmoid(mask_outputs),
+ })
+
+ return model_outputs
+
+
+def _build_assigment_map(optimizer, prefix=None, skip_variables_regex=None):
+ """Generate assigment map for loading checkpoints."""
+ optimizer_vars = set([var.name for var in optimizer.variables()])
+ all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
+ if not prefix:
+ prefix = ''
+ assignment_map = {}
+ for var in all_vars:
+ if var.name not in optimizer_vars:
+ var_name = var.name
+ # Trim the index of the variable.
+ if ':' in var_name:
+ var_name = var_name[:var_name.rindex(':')]
+ if skip_variables_regex and re.match(skip_variables_regex,
+ var_name[len(prefix):]):
+ continue
+ assignment_map[var_name[len(prefix):]] = var
+ return assignment_map
+
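+# For example, with prefix 'resnet50/', a graph variable named
+# 'resnet50/conv2d/kernel:0' is stored under the checkpoint key
+# 'conv2d/kernel', so a backbone-only checkpoint saved without the scope
+# prefix can still be restored into the scoped graph.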
+
+def _model_fn(features, labels, mode, params, variable_filter_fn=None):
+ """Model defination for the Mask-RCNN model based on ResNet.
+
+ Args:
+ features: the input image tensor and auxiliary information, such as
+ `image_info` and `source_ids`. The image tensor has a shape of
+ [batch_size, height, width, 3]. The height and width are fixed and equal.
+ labels: the input labels in a dictionary. The labels include score targets
+ and box targets which are dense label maps. The labels are generated from
+ get_input_fn function in data/dataloader.py
+ mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+ params: the dictionary that defines the hyperparameters of the model. The
+ default settings are in the default_hparams function in this file.
+ variable_filter_fn: the filter function that takes trainable_variables and
+ returns the variable list after applying the filter rule.
+
+ Returns:
+ tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
+ """
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if ((params['include_groundtruth_in_features'] or
+ mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
+ # Include groundtruth for eval.
+ labels = features['labels']
+
+ if 'features' in features:
+ features = features['features']
+ # Otherwise, it is in export mode and the features are passed in directly.
+
+ if params['precision'] == 'bfloat16':
+ with tf.tpu.bfloat16_scope():
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+ def cast_outputs_to_float(d):
+ for k, v in sorted(six.iteritems(d)):
+ if isinstance(v, dict):
+ cast_outputs_to_float(v)
+ else:
+ d[k] = tf.cast(v, tf.float32)
+ cast_outputs_to_float(model_outputs)
+ else:
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ # First check if it is in PREDICT or EVAL mode to fill out predictions.
+ # Predictions are used during the eval step to generate metrics.
+ predictions = {}
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if 'orig_images' in features:
+ model_outputs['orig_images'] = features['orig_images']
+ if labels and params['include_groundtruth_in_features']:
+ # Labels can only be embedded in predictions. The prediction output cannot
+ # contain a dictionary as a value.
+ predictions.update(labels)
+ model_outputs.pop('fpn_features', None)
+ predictions.update(model_outputs)
+ # If we are doing PREDICT, we can return here.
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
+ predictions=predictions)
+ return tf.estimator.EstimatorSpec(mode=mode,
+ predictions=predictions)
+
+ # Set up training loss and learning rate.
+ global_step = tf.train.get_or_create_global_step()
+ if params['learning_rate_type'] == 'step':
+ learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['learning_rate_levels'],
+ params['learning_rate_steps'])
+ elif params['learning_rate_type'] == 'cosine':
+ learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['total_steps'])
+ else:
+ raise ValueError('Unsupported learning rate type: `{}`!'
+ .format(params['learning_rate_type']))
+ # score_loss and box_loss are for logging. only total_loss is optimized.
+ total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
+ model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
+ labels, params)
+
+ (total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss) = losses.fast_rcnn_loss(
+ model_outputs['class_outputs'], model_outputs['box_outputs'],
+ model_outputs['class_targets'], model_outputs['box_targets'], params)
+ # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
+ if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
+ mask_loss = losses.mask_rcnn_loss(
+ model_outputs['mask_outputs'], model_outputs['mask_targets'],
+ model_outputs['selected_class_targets'], params)
+ else:
+ mask_loss = 0.
+ if variable_filter_fn and ('resnet' in params['backbone']):
+ var_list = variable_filter_fn(tf.trainable_variables(),
+ params['backbone'] + '/')
+ else:
+ var_list = tf.trainable_variables()
+ l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in var_list
+ if 'batch_normalization' not in v.name and 'bias' not in v.name
+ ])
+ total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
+ l2_regularization_loss)
+
+ host_call = None
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = create_optimizer(learning_rate, params)
+ if params['use_tpu']:
+ optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+
+ scaffold_fn = None
+ if params['warm_start_path']:
+
+ def warm_start_scaffold_fn():
+ logging.info(
+ 'model_fn warm start from: %s,', params['warm_start_path'])
+ assignment_map = _build_assigment_map(
+ optimizer,
+ prefix=None,
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
+ return tf.train.Scaffold()
+
+ scaffold_fn = warm_start_scaffold_fn
+
+ elif params['checkpoint']:
+
+ def backbone_scaffold_fn():
+ """Loads pretrained model through scaffold function."""
+ # Exclude all variables of the optimizer.
+ vars_to_load = _build_assigment_map(
+ optimizer,
+ prefix=params['backbone'] + '/',
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
+ if not vars_to_load:
+ raise ValueError('Variables to load is empty.')
+ return tf.train.Scaffold()
+
+ scaffold_fn = backbone_scaffold_fn
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
+ if params['global_gradient_clip_ratio'] > 0:
+ # Clips the gradients for training stability.
+ # Refer: https://arxiv.org/abs/1211.5063
+ with tf.name_scope('clipping'):
+ old_grads, variables = zip(*grads_and_vars)
+ num_weights = sum(
+ g.shape.num_elements() for g in old_grads if g is not None)
+ clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
+ num_weights)
+ logging.info(
+ 'Global clip norm set to %g for %d variables with %d elements.',
+ clip_norm, sum(1 for g in old_grads if g is not None),
+ num_weights)
+ gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
+ else:
+ gradients, variables = zip(*grads_and_vars)
+ grads_and_vars = []
+ # Special treatment for biases (beta is named as bias in reference model)
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
+ for grad, var in zip(gradients, variables):
+ if grad is not None and ('beta' in var.name or 'bias' in var.name):
+ grad = 2.0 * grad
+ grads_and_vars.append((grad, var))
+
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.apply_gradients(
+ grads_and_vars, global_step=global_step)
+
+ if params['use_host_call']:
+ def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
+ rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
+ learning_rate):
+ """Training host call. Creates scalar summaries for training metrics.
+
+ This function is executed on the CPU and should not directly reference
+ any Tensors in the rest of the `model_fn`. To pass Tensors from the
+ model to the `metric_fn`, provide as part of the `host_call`. See
+ https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
+ for more information.
+
+ Arguments should match the list of `Tensor` objects passed as the second
+ element in the tuple passed to `host_call`.
+
+ Args:
+ global_step: `Tensor` with shape `[batch, ]` for the global_step.
+ total_loss: `Tensor` with shape `[batch, ]` for the training loss.
+ total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ loss.
+ rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ score loss.
+ rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ box loss.
+ total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN loss.
+ fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN class loss.
+ fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN box loss.
+ mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
+ mask loss.
+ l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
+ regularization loss.
+ learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
+
+ Returns:
+ List of summary ops to run on the CPU host.
+ """
+ # Outfeed supports int32 but global_step is expected to be int64.
+ global_step = tf.reduce_mean(global_step)
+ # Host call fns are executed FLAGS.iterations_per_loop times after one
+ # TPU loop is finished. Setting max_queue to the same number of
+ # iterations makes the summary writer flush data to storage only once
+ # per loop.
+ with (tf2.summary.create_file_writer(
+ params['model_dir'],
+ max_queue=params['iterations_per_loop']).as_default()):
+ with tf2.summary.record_if(True):
+ tf2.summary.scalar(
+ 'total_loss', tf.reduce_mean(total_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
+ step=global_step)
+ if params['include_mask']:
+ tf2.summary.scalar(
+ 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
+ tf2.summary.scalar(
+ 'l2_regularization_loss',
+ tf.reduce_mean(l2_regularization_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'learning_rate', tf.reduce_mean(learning_rate),
+ step=global_step)
+
+ return tf.summary.all_v2_summary_ops()
+
+ # To log the loss, current learning rate, and epoch for Tensorboard, the
+ # summary op needs to be run on the host CPU via host_call. host_call
+ # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+ # dimension. These Tensors are implicitly concatenated to
+ # [params['batch_size']].
+ global_step_t = tf.reshape(global_step, [1])
+ total_loss_t = tf.reshape(total_loss, [1])
+ total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
+ rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
+ rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
+ total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
+ fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
+ fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
+ mask_loss_t = tf.reshape(mask_loss, [1])
+ l2_regularization_loss = tf.reshape(l2_regularization_loss, [1])
+ learning_rate_t = tf.reshape(learning_rate, [1])
+ host_call = (host_call_fn,
+ [global_step_t, total_loss_t, total_rpn_loss_t,
+ rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
+ fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
+ mask_loss_t, l2_regularization_loss, learning_rate_t])
+ else:
+ train_op = None
+ scaffold_fn = None
+
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(
+ mode=mode,
+ loss=total_loss,
+ train_op=train_op,
+ host_call=host_call,
+ scaffold_fn=scaffold_fn)
+ return tf.estimator.EstimatorSpec(
+ mode=mode, loss=total_loss, train_op=train_op)
+
+
+def mask_rcnn_model_fn(features, labels, mode, params):
+ """Mask-RCNN model."""
+ with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+ return _model_fn(
+ features,
+ labels,
+ mode,
+ params,
+ variable_filter_fn=remove_variables)
diff --git a/models/official/mask_rcnn/mask_rcnn_model_new.py b/models/official/mask_rcnn/mask_rcnn_model_new.py
new file mode 100644
index 000000000..f6517f4f9
--- /dev/null
+++ b/models/official/mask_rcnn/mask_rcnn_model_new.py
@@ -0,0 +1,798 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model defination for the Mask-RCNN Model.
+
+Defines model_fn of Mask-RCNN for TF Estimator. The model_fn includes Mask-RCNN
+model architecture, loss function, learning rate schedule, and evaluation
+procedure.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+import re
+from absl import logging
+import numpy as np
+import six
+import tensorflow.compat.v1 as tf
+import tensorflow.compat.v2 as tf2
+
+import anchors
+import fpn
+import heads
+import learning_rates
+import losses
+import postprocess_ops
+import resnet
+import roi_ops
+import spatial_transform_ops
+import training_ops
+import sys
+
+sys.path.append('tpu/models/official/mnasnet')
+
+
+# import mnasnet_models
+
+
+def create_optimizer(learning_rate, params):
+ """Creates optimized based on the specified flags."""
+ if params['optimizer'] == 'momentum':
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'adam':
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ elif params['optimizer'] == 'adadelta':
+ optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+ elif params['optimizer'] == 'adagrad':
+ optimizer = tf.train.AdagradOptimizer(learning_rate)
+ elif params['optimizer'] == 'rmsprop':
+ optimizer = tf.train.RMSPropOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'lars':
+ try:
+ from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
+
+ optimizer = LARSOptimizer(
+ learning_rate,
+ momentum=params['momentum'],
+ weight_decay=params['lars_weight_decay'],
+ skip_list=['batch_normalization', 'bias'])
+ except ImportError as e:
+ logging.exception('LARSOptimizer is currently not supported '
+ 'in TensorFlow 2.x.')
+ raise e
+
+ else:
+ raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
+ return optimizer
+
+
+def remove_variables(variables, prefix):
+ """Removes low-level variables from the input.
+
+ Removing low-level parameters (e.g., initial convolution layer) from training
+ usually leads to higher training speed and slightly better testing accuracy.
+ The intuition is that the low-level architecture (e.g., ResNet-50) is able to
+ capture low-level features such as edges; therefore, it does not need to be
+ fine-tuned for the detection task.
+
+ Args:
+ variables: all the variables in training
+ prefix: prefix of backbone
+
+ Returns:
+ var_list: a list containing variables for training
+
+ """
+ # Freeze at conv2 based on reference model.
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
+ remove_list = []
+ remove_list.append(prefix + 'conv2d/')
+ remove_list.append(prefix + 'batch_normalization/')
+ for i in range(1, 11):
+ remove_list.append(prefix + 'conv2d_{}/'.format(i))
+ remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
+
+ def _is_kept(variable):
+ for rm_str in remove_list:
+ if rm_str in variable.name:
+ return False
+ return True
+
+ var_list = [v for v in variables if _is_kept(v)]
+ return var_list
+
+
+def compute_model_statistics(batch_size, is_training=True):
+ """Compute number of parameters and FLOPS."""
+ num_trainable_params = np.sum(
+ [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
+ logging.info('number of trainable params: %d', num_trainable_params)
+
+ options = tf.profiler.ProfileOptionBuilder.float_operation()
+ options['output'] = 'none'
+ flops = tf.profiler.profile(
+ tf.get_default_graph(), options=options).total_float_ops
+ flops_per_image = flops / batch_size
+ if is_training:
+ logging.info(
+ 'number of FLOPS per image: %f in training', flops_per_image)
+ else:
+ logging.info(
+ 'number of FLOPS per image: %f in eval', flops_per_image)
+
+
+def build_box_outputs(mask, params):
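+ """Derives box outputs from soft mask predictions.
+
+ The soft masks are reduced to per-row and per-column maxima, thresholded,
+ and multiplied by fixed transform matrices that isolate the first and last
+ active position along each axis (the thresholded profiles are stop-gradient,
+ so gradients still flow through the soft mask scores). The extents are
+ stacked into [y1, x1, y2, x2] boxes in mask-pixel coordinates.
+
+ Args:
+   mask: a [batch_size, num_boxes, mask_size, mask_size] tensor of soft mask
+     scores.
+   params: the model hyperparameter dictionary.
+
+ Returns:
+   A tuple of ([batch_size, num_boxes, 4] boxes, mask_size).
+ """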
+ with tf.tpu.bfloat16_scope():
+ with tf.variable_scope('bbox_head_by_mask'):
+
+ def build_transform_variable(l_r, min_cost, image_size):
+ variable = []
+ if l_r == "right":
+ l_v = min_cost
+ r_v = 0
+ elif l_r == "left":
+ l_v = 0
+ r_v = min_cost
+ else:
+ raise ValueError('l_r must be left or right')
+
+ for i in range(image_size):
+ row = []
+ for k in range(i):
+ row.append(l_v)
+ row.append(1)
+ for k in range(image_size - 1 - i):
+ row.append(r_v)
+ variable.append(row)
+
+ return variable
+
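+ # Worked example: for image_size=3 (min_cost is -image_size * 2 = -6 in
+ # cal_offset below), build_transform_variable('left', -6., 3) is
+ #   [[1, -6, -6], [0, 1, -6], [0, 0, 1]]
+ # and build_transform_variable('right', -6., 3) is
+ #   [[1, 0, 0], [-6, 1, 0], [-6, -6, 1]].
+ # Multiplying a thresholded 0/1 profile by these matrices and applying a
+ # relu leaves a 1 only at the first (respectively last) active index, which
+ # cal_offset turns into the box extents.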
+ def cal_offset(scope, input, img_size, alpha=1e-4):
+ with tf.variable_scope(scope):
+ c_left = tf.constant(build_transform_variable("left", -img_size * 2., img_size), dtype=tf.float32)
+ c_right = tf.constant(build_transform_variable("right", -img_size * 2., img_size), dtype=tf.float32)
+ if params['precision'] == 'bfloat16':
+ c_left = tf.cast(c_left, tf.bfloat16)
+ c_right = tf.cast(c_right, tf.bfloat16)
+ with tf.variable_scope("mask"):
+ if params['precision'] == 'bfloat16':
+ net = tf.cast(tf.greater(input, alpha), tf.bfloat16)
+ else:
+ net = tf.cast(tf.greater(input, alpha), tf.float32)
+
+ # for left
+ net_left = tf.nn.relu(tf.matmul(net, c_left))
+ mask_left = tf.stop_gradient(net_left, name="mask_left")
+
+ # for right
+ net_right = tf.nn.relu(tf.matmul(net, c_right))
+ mask_right = tf.stop_gradient(net_right, name="mask")
+
+ with tf.variable_scope("work"):
+ offset_left = 1 - input
+ # Use the input dtype so both the bfloat16 and float32 precision paths work.
+ left = offset_left + tf.constant([float(i) for i in range(img_size)], dtype=input.dtype)
+ left = tf.reduce_sum(left * mask_left, axis=-1)
+
+ right = input + tf.constant([float(i) for i in range(img_size)], dtype=input.dtype)
+ right = tf.reduce_sum(right * mask_right, axis=-1)
+
+ return left, right, [net_right, input]
+
+ batch_size, num_boxes, img_size, _ = mask.get_shape().as_list()
+ mask = tf.reshape(mask, [-1, img_size, img_size])
+ mask = tf.nn.dropout(mask, keep_prob=0.98)
+ mask = tf.clip_by_value(mask, 0, 1)
+ row = tf.reduce_max(mask, axis=2) # h
+ col = tf.reduce_max(mask, axis=1) # w
+ row_l, row_r, debug_row = cal_offset("cal_offset_row", row, img_size)
+ col_l, col_r, debug_col = cal_offset("cal_offset_col", col, img_size)
+ bbox = tf.stack([row_l, col_l, row_r, col_r], axis=-1)
+ bbox = tf.reshape(bbox, [batch_size, num_boxes, 4])
+
+ return bbox, img_size # y1,x1,y2,x2
+
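+# Illustrative sketch (not used by the model): the same first/last
+# active-index idea as cal_offset above, written for a single 1-D mask
+# profile with plain NumPy.
+#
+#   import numpy as np
+#   profile = np.array([0., 0., 0.9, 0.8, 0.7, 0.])  # per-column max of a mask
+#   active = profile > 1e-4
+#   first = int(np.argmax(active))                         # -> 2 (x1)
+#   last = len(active) - 1 - int(np.argmax(active[::-1]))  # -> 4 (x2)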
+
+def build_model_graph(features, labels, is_training, params):
+ """Builds the forward model graph."""
+ use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
+ is_gpu_inference = (not is_training and use_batched_nms)
+ model_outputs = {}
+
+ if is_training and params['transpose_input']:
+ if (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0):
+ features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ else:
+ features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
+
+ batch_size, image_height, image_width, _ = (
+ features['images'].get_shape().as_list())
+
+ conv0_space_to_depth_block_size = 0
+ if (is_training and
+ (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0)):
+ conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
+ image_height *= conv0_space_to_depth_block_size
+ image_width *= conv0_space_to_depth_block_size
+
+ if 'source_ids' not in features:
+ features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
+
+ all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
+ params['num_scales'], params['aspect_ratios'],
+ params['anchor_scale'],
+ (image_height, image_width))
+
+ if 'resnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ resnet_fn = resnet.resnet_v1(
+ params['backbone'],
+ conv0_kernel_size=params['conv0_kernel_size'],
+ conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
+ num_batch_norm_group=params['num_batch_norm_group'])
+ backbone_feats = resnet_fn(
+ features['images'],
+ (params['is_training_bn'] and is_training))
+ elif 'mnasnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ _, endpoints = mnasnet_models.build_mnasnet_base(
+ features['images'],
+ params['backbone'],
+ training=(params['is_training_bn'] and is_training),
+ override_params={'use_keras': False})
+
+ backbone_feats = {
+ 2: endpoints['reduction_2'],
+ 3: endpoints['reduction_3'],
+ 4: endpoints['reduction_4'],
+ 5: endpoints['reduction_5'],
+ }
+ else:
+ raise ValueError('Not a valid backbone option: %s' % params['backbone'])
+
+ fpn_feats = fpn.fpn(
+ backbone_feats, params['min_level'], params['max_level'])
+ model_outputs.update({
+ 'fpn_features': fpn_feats,
+ })
+
+ rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
+ fpn_feats,
+ params['min_level'], params['max_level'],
+ len(params['aspect_ratios'] * params['num_scales']))
+
+ if is_training:
+ rpn_pre_nms_topn = params['rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['rpn_post_nms_topn']
+ else:
+ rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['test_rpn_post_nms_topn']
+
+ # rpn_box_rois: [ymin, xmin, ymax, xmax]
+ rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
+ rpn_score_outputs,
+ rpn_box_outputs,
+ all_anchors,
+ features['image_info'],
+ rpn_pre_nms_topn,
+ rpn_post_nms_topn,
+ params['rpn_nms_threshold'],
+ params['rpn_min_size'],
+ bbox_reg_weights=None,
+ use_batched_nms=use_batched_nms)
+ rpn_box_rois = tf.to_float(rpn_box_rois)
+ if is_training:
+ rpn_box_rois = tf.stop_gradient(rpn_box_rois)
+ rpn_box_scores = tf.stop_gradient(rpn_box_scores)
+
+ if is_training:
+ # Sampling
+ box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
+ training_ops.proposal_label_op(
+ rpn_box_rois,
+ labels['gt_boxes'],
+ labels['gt_classes'],
+ batch_size_per_im=params['batch_size_per_im'],
+ fg_fraction=params['fg_fraction'],
+ fg_thresh=params['fg_thresh'],
+ bg_thresh_hi=params['bg_thresh_hi'],
+ bg_thresh_lo=params['bg_thresh_lo']))
+
+ # Performs multi-level RoIAlign.
+ box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
+
+ class_outputs, _, _ = heads.box_head(
+ box_roi_features, num_classes=params['num_classes'],
+ mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
+
+ if not is_training:
+ if is_gpu_inference:
+ generate_detections_fn = postprocess_ops.generate_detections_gpu
+ else:
+ generate_detections_fn = postprocess_ops.generate_detections_tpu
+ detections = generate_detections_fn(
+ class_outputs,
+ rpn_box_rois,
+ features['image_info'],
+ params['test_rpn_post_nms_topn'],
+ params['test_detections_per_image'],
+ params['test_nms'],
+ params['bbox_reg_weights'])
+
+ model_outputs.update({
+ 'num_detections': detections[0],
+ 'detection_boxes': detections[1],
+ 'detection_classes': detections[2],
+ 'detection_scores': detections[3],
+ })
+ else:
+ # encoded_box_targets = training_ops.encode_box_targets(
+ # rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
+ model_outputs.update({
+ 'rpn_score_outputs': rpn_score_outputs,
+ 'rpn_box_outputs': rpn_box_outputs,
+ 'class_outputs': class_outputs,
+ 'class_targets': class_targets,
+ # 'box_outputs': box_outputs,
+ # 'box_targets': box_targets,
+ # 'box_rois': rpn_box_rois,
+ })
+
+ # # Faster-RCNN mode.
+ # if not params['include_mask']:
+ # # Print #parameters and #FLOPs in model.
+ # compute_model_statistics(batch_size, is_training=is_training)
+ #
+ # return model_outputs
+
+ def expand_boxes(boxes, scale):
+ # whereas `boxes` here is in [y1, x1, y2, x2] form
+ w_half = (boxes[..., 3] - boxes[..., 1]) * .5
+ h_half = (boxes[..., 2] - boxes[..., 0]) * .5
+ x_c = boxes[..., 1] + w_half
+ y_c = boxes[..., 0] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = tf.stack([y_c - h_half, x_c - w_half, y_c + h_half, x_c + w_half], axis=-1)
+
+ return boxes_exp
+
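+ # For reference, a box [0., 0., 10., 20.] expanded with scale=1.2 becomes
+ # [-1., -2., 11., 22.]: the center is preserved while the height and width
+ # both grow by 1.2x.
+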
+ # Mask sampling
+ if not is_training:
+ selected_box_rois = model_outputs['detection_boxes']
+ class_indices = model_outputs['detection_classes']
+ # If using GPU for inference, delay the cast until the Gather ops show up,
+ # since GPU inference handles floating point better.
+ # TODO(laigd): revisit this when newer versions of the GPU libraries are
+ # released.
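+ # The boxes are expanded by 1.2x before the mask crop below, presumably to
+ # leave some margin so the mask-derived boxes from build_box_outputs can
+ # extend slightly beyond the first-stage boxes (this rationale is an
+ # assumption; it is not stated in this change).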
+ selected_box_rois = expand_boxes(selected_box_rois, 1.2)
+ if not is_gpu_inference:
+ class_indices = tf.to_int32(class_indices)
+ else:
+ (selected_class_targets, selected_box_targets, selected_box_rois,
+ proposal_to_label_map) = (
+ training_ops.select_fg_for_masks(
+ class_targets, box_targets, rpn_box_rois,
+ proposal_to_label_map,
+ max_num_fg=int(
+ params['batch_size_per_im'] * params['fg_fraction'])))
+
+ selected_box_rois = expand_boxes(selected_box_targets, 1.2)
+ class_indices = tf.to_int32(selected_class_targets)
+
+
+ import box_utils
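+ # Clip the (expanded) RoIs to the image extent; the 1024x1024 bound is
+ # hard-coded here and assumes the model's default padded image size.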
+ selected_box_rois = box_utils.clip_boxes(selected_box_rois, 1024, 1024)
+ mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats,
+ selected_box_rois,
+ output_size=14,
+ is_gpu_inference=is_gpu_inference)
+ mask_outputs = heads.mask_head(
+ mask_roi_features,
+ class_indices,
+ num_classes=params['num_classes'],
+ mrcnn_resolution=params['mrcnn_resolution'],
+ is_gpu_inference=is_gpu_inference)
+
+ soft_mask_outputs = tf.nn.sigmoid(mask_outputs)
+ box_outputs_in_mak, image_size = build_box_outputs(soft_mask_outputs, params)
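+ # build_box_outputs returns boxes in mask-pixel coordinates ([0, image_size],
+ # where image_size is the mask resolution). Scale them by the RoI
+ # height/width over the mask resolution and shift by the RoI's top-left
+ # corner to map them back into image coordinates.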
+ offset = tf.stack(
+ [selected_box_rois[..., 0], selected_box_rois[..., 1], selected_box_rois[..., 0], selected_box_rois[..., 1]],
+ axis=-1)
+ h_scale = (selected_box_rois[..., 2] - selected_box_rois[..., 0]) / image_size
+ w_scale = (selected_box_rois[..., 3] - selected_box_rois[..., 1]) / image_size
+ wh_scale = tf.stack([h_scale, w_scale, h_scale, w_scale], axis=-1)
+ if params['precision'] == 'bfloat16':
+ wh_scale = tf.cast(wh_scale, tf.bfloat16)
+ offset = tf.cast(offset, tf.bfloat16)
+
+ box_outputs = offset + box_outputs_in_mak * wh_scale
+
+ # Print #parameters and #FLOPs in model.
+ compute_model_statistics(batch_size, is_training=is_training)
+
+ if is_training:
+ mask_targets = training_ops.get_mask_targets(
+ selected_box_rois, proposal_to_label_map, selected_box_targets,
+ labels['cropped_gt_masks'], params['mrcnn_resolution'])
+ model_outputs.update({
+ 'mask_outputs': mask_outputs,
+ 'mask_targets': mask_targets,
+ 'selected_class_targets': selected_class_targets,
+ 'box_outputs': box_outputs,
+ 'box_targets': selected_box_targets,
+ })
+ else:
+ model_outputs.update({
+ 'detection_masks': soft_mask_outputs,
+ 'detection_boxes': box_outputs,
+ 'selected_box_rois': selected_box_rois
+ })
+
+ return model_outputs
+
+
+def _build_assigment_map(optimizer, prefix=None, skip_variables_regex=None):
+ """Generate assigment map for loading checkpoints."""
+ optimizer_vars = set([var.name for var in optimizer.variables()])
+ all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
+ if not prefix:
+ prefix = ''
+ assignment_map = {}
+ for var in all_vars:
+ if var.name not in optimizer_vars:
+ var_name = var.name
+ # Trim the index of the variable.
+ if ':' in var_name:
+ var_name = var_name[:var_name.rindex(':')]
+ if skip_variables_regex and re.match(skip_variables_regex,
+ var_name[len(prefix):]):
+ continue
+ assignment_map[var_name[len(prefix):]] = var
+ return assignment_map
+
+
+def _model_fn(features, labels, mode, params, variable_filter_fn=None):
+ """Model defination for the Mask-RCNN model based on ResNet.
+
+ Args:
+ features: the input image tensor and auxiliary information, such as
+ `image_info` and `source_ids`. The image tensor has a shape of
+ [batch_size, height, width, 3]. The height and width are fixed and equal.
+ labels: the input labels in a dictionary. The labels include score targets
+ and box targets which are dense label maps. The labels are generated from
+ get_input_fn function in data/dataloader.py
+ mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+ params: the dictionary that defines the hyperparameters of the model. The
+ default settings are in the default_hparams function in this file.
+ variable_filter_fn: the filter function that takes trainable_variables and
+ returns the variable list after applying the filter rule.
+
+ Returns:
+ tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
+ """
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if ((params['include_groundtruth_in_features'] or
+ mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
+ # Include groundtruth for eval.
+ labels = features['labels']
+
+ if 'features' in features:
+ features = features['features']
+ # Otherwise, it is in export mode and the features are passed in directly.
+
+ if params['precision'] == 'bfloat16':
+ with tf.tpu.bfloat16_scope():
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ def cast_outputs_to_float(d):
+ for k, v in sorted(six.iteritems(d)):
+ if isinstance(v, dict):
+ cast_outputs_to_float(v)
+ else:
+ d[k] = tf.cast(v, tf.float32)
+
+ cast_outputs_to_float(model_outputs)
+ else:
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ # First check if it is in PREDICT or EVAL mode to fill out predictions.
+ # Predictions are used during the eval step to generate metrics.
+ predictions = {}
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if 'orig_images' in features:
+ model_outputs['orig_images'] = features['orig_images']
+ if labels and params['include_groundtruth_in_features']:
+ # Labels can only be embedded in predictions. The prediction output cannot
+ # contain a dictionary as a value.
+ predictions.update(labels)
+ model_outputs.pop('fpn_features', None)
+ predictions.update(model_outputs)
+ # If we are doing PREDICT, we can return here.
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
+ predictions=predictions)
+ return tf.estimator.EstimatorSpec(mode=mode,
+ predictions=predictions)
+
+ # Set up training loss and learning rate.
+ global_step = tf.train.get_or_create_global_step()
+ if params['learning_rate_type'] == 'step':
+ learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['learning_rate_levels'],
+ params['learning_rate_steps'])
+ elif params['learning_rate_type'] == 'cosine':
+ learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['total_steps'])
+ else:
+ raise ValueError('Unsupported learning rate type: `{}`!'
+ .format(params['learning_rate_type']))
+ # score_loss and box_loss are for logging. only total_loss is optimized.
+ total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
+ model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
+ labels, params)
+
+ (total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss) = losses.fast_rcnn_loss(
+ model_outputs['class_outputs'], model_outputs['box_outputs'],
+ model_outputs['class_targets'], model_outputs['box_targets'], model_outputs['selected_class_targets'], params)
+ # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
+ if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
+ mask_loss = losses.mask_rcnn_loss(
+ model_outputs['mask_outputs'], model_outputs['mask_targets'],
+ model_outputs['selected_class_targets'], params)
+ else:
+ mask_loss = 0.
+ if variable_filter_fn and ('resnet' in params['backbone']):
+ var_list = variable_filter_fn(tf.trainable_variables(),
+ params['backbone'] + '/')
+ else:
+ var_list = tf.trainable_variables()
+ l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in var_list
+ if 'batch_normalization' not in v.name and 'bias' not in v.name
+ ])
+ total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss + l2_regularization_loss)
+
+ host_call = None
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = create_optimizer(learning_rate, params)
+ if params['use_tpu']:
+ optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+
+ scaffold_fn = None
+ if params['warm_start_path']:
+
+ def warm_start_scaffold_fn():
+ logging.info(
+ 'model_fn warm start from: %s,', params['warm_start_path'])
+ assignment_map = _build_assigment_map(
+ optimizer,
+ prefix=None,
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
+ return tf.train.Scaffold()
+
+ scaffold_fn = warm_start_scaffold_fn
+
+ elif params['checkpoint']:
+
+ def backbone_scaffold_fn():
+ """Loads pretrained model through scaffold function."""
+ # Exclude all variables of the optimizer.
+ vars_to_load = _build_assigment_map(
+ optimizer,
+ prefix=params['backbone'] + '/',
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
+ if not vars_to_load:
+ raise ValueError('Variables to load is empty.')
+ return tf.train.Scaffold()
+
+ scaffold_fn = backbone_scaffold_fn
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
+ if params['global_gradient_clip_ratio'] > 0:
+ # Clips the gradients for training stability.
+ # Refer: https://arxiv.org/abs/1211.5063
+ with tf.name_scope('clipping'):
+ old_grads, variables = zip(*grads_and_vars)
+ num_weights = sum(
+ g.shape.num_elements() for g in old_grads if g is not None)
+ clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
+ num_weights)
+ logging.info(
+ 'Global clip norm set to %g for %d variables with %d elements.',
+ clip_norm, sum(1 for g in old_grads if g is not None),
+ num_weights)
+ gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
+ else:
+ gradients, variables = zip(*grads_and_vars)
+ grads_and_vars = []
+ # Special treatment for biases (beta is named as bias in reference model)
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
+ for grad, var in zip(gradients, variables):
+ if grad is not None and ('beta' in var.name or 'bias' in var.name):
+ grad = 2.0 * grad
+ grads_and_vars.append((grad, var))
+
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.apply_gradients(
+ grads_and_vars, global_step=global_step)
+
+ if params['use_host_call']:
+ def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
+ rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
+ learning_rate):
+ """Training host call. Creates scalar summaries for training metrics.
+
+ This function is executed on the CPU and should not directly reference
+ any Tensors in the rest of the `model_fn`. To pass Tensors from the
+ model to this function, provide them as part of the `host_call`. See
+ https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
+ for more information.
+
+ Arguments should match the list of `Tensor` objects passed as the second
+ element in the tuple passed to `host_call`.
+
+ Args:
+ global_step: `Tensor` with shape `[batch, ]` for the global_step.
+ total_loss: `Tensor` with shape `[batch, ]` for the training loss.
+ total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ loss.
+ rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ score loss.
+ rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ box loss.
+ total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
+ training Fast-RCNN loss.
+ fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
+ training Fast-RCNN class loss.
+ fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
+ training Fast-RCNN box loss.
+ mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
+ mask loss.
+ l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
+ regularization loss.
+ learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
+
+ Returns:
+ List of summary ops to run on the CPU host.
+ """
+ # Outfeed supports int32 but global_step is expected to be int64.
+ global_step = tf.reduce_mean(global_step)
+ # Host call fns are executed FLAGS.iterations_per_loop times after one
+ # TPU loop is finished; setting max_queue to the same number of
+ # iterations makes the summary writer flush the data to storage only
+ # once per loop.
+ with (tf2.summary.create_file_writer(
+ params['model_dir'],
+ max_queue=params['iterations_per_loop']).as_default()):
+ with tf2.summary.record_if(True):
+ tf2.summary.scalar(
+ 'total_loss', tf.reduce_mean(total_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
+ step=global_step)
+ if params['include_mask']:
+ tf2.summary.scalar(
+ 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
+ tf2.summary.scalar(
+ 'l2_regularization_loss',
+ tf.reduce_mean(l2_regularization_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'learning_rate', tf.reduce_mean(learning_rate),
+ step=global_step)
+
+ return tf.summary.all_v2_summary_ops()
+
+ # To log the loss, current learning rate, and epoch for Tensorboard, the
+ # summary op needs to be run on the host CPU via host_call. host_call
+ # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+ # dimension. These Tensors are implicitly concatenated to
+ # [params['batch_size']].
+ global_step_t = tf.reshape(global_step, [1])
+ total_loss_t = tf.reshape(total_loss, [1])
+ total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
+ rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
+ rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
+ total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
+ fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
+ fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
+ mask_loss_t = tf.reshape(mask_loss, [1])
+ l2_regularization_loss_t = tf.reshape(l2_regularization_loss, [1])
+ learning_rate_t = tf.reshape(learning_rate, [1])
+ host_call = (host_call_fn,
+ [global_step_t, total_loss_t, total_rpn_loss_t,
+ rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
+ fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
+ mask_loss_t, l2_regularization_loss_t, learning_rate_t])
+ else:
+ train_op = None
+ scaffold_fn = None
+
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(
+ mode=mode,
+ loss=total_loss,
+ train_op=train_op,
+ host_call=host_call,
+ scaffold_fn=scaffold_fn)
+ return tf.estimator.EstimatorSpec(
+ mode=mode, loss=total_loss, train_op=train_op)
+
+
+def mask_rcnn_model_fn(features, labels, mode, params):
+ """Mask-RCNN model."""
+ with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+ return _model_fn(
+ features,
+ labels,
+ mode,
+ params,
+ variable_filter_fn=remove_variables)
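For reference, the training branch above clips gradients to a global norm proportional to the square root of the total number of trainable weight elements, and doubles the gradients of bias/beta variables following the Detectron reference. Below is a minimal pure-Python sketch of that arithmetic, with hypothetical helper names and plain floats standing in for Tensors; the actual patch uses the TF ops shown in the diff (optimizer.compute_gradients, tf.clip_by_global_norm).

import math

def global_clip_norm(num_elements_per_var, clip_ratio):
    # clip_norm = ratio * sqrt(total number of trainable weight elements),
    # mirroring the computation inside the 'clipping' name scope above.
    return clip_ratio * math.sqrt(sum(num_elements_per_var))

def scale_bias_gradients(grads_and_var_names):
    # Biases (and batch-norm betas, which contain 'beta' in their names)
    # receive a 2x gradient before apply_gradients.
    out = []
    for grad, name in grads_and_var_names:
        if grad is not None and ('beta' in name or 'bias' in name):
            grad = 2.0 * grad
        out.append((grad, name))
    return out

print(global_clip_norm([1000, 64, 64], 0.02))                    # ~0.672
print(scale_bias_gradients([(0.1, 'conv/kernel'), (0.1, 'conv/bias')]))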
diff --git a/models/official/mask_rcnn/postprocess_ops.py b/models/official/mask_rcnn/postprocess_ops.py
index ecabb8b08..4a829ce68 100644
--- a/models/official/mask_rcnn/postprocess_ops.py
+++ b/models/official/mask_rcnn/postprocess_ops.py
@@ -24,7 +24,6 @@
def generate_detections_per_image_tpu(cls_outputs,
- box_outputs,
anchor_boxes,
image_info,
pre_nms_num_detections=1000,
@@ -70,18 +69,20 @@ def generate_detections_per_image_tpu(cls_outputs,
top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)
anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
- box_outputs = tf.reshape(
- box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
+ boxes = anchor_boxes
+ # box_outputs = tf.reshape(
+ # box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
class_indices = classes
- box_outputs = tf.gather_nd(box_outputs,
- tf.stack([top_k_indices, class_indices], axis=1))
+ # box_outputs = tf.gather_nd(box_outputs,
+ # tf.stack([top_k_indices, class_indices], axis=1))
# apply bounding box regression to anchors
- boxes = box_utils.decode_boxes(
- box_outputs, anchor_boxes, bbox_reg_weights)
+ # boxes = box_utils.decode_boxes(
+ # box_outputs, anchor_boxes, bbox_reg_weights)
boxes = box_utils.clip_boxes(
boxes, image_info[0], image_info[1])
+
list_of_all_boxes = []
list_of_all_scores = []
list_of_all_classes = []
@@ -134,7 +135,6 @@ def generate_detections_per_image_tpu(cls_outputs,
def generate_detections_tpu(class_outputs,
- box_outputs,
anchor_boxes,
image_info,
pre_nms_num_detections=1000,
@@ -175,7 +175,7 @@ def generate_detections_tpu(class_outputs,
num_valid_boxes, box_coordinates, box_classes, box_scores = ([], [], [], [])
for i in range(batch_size):
result = generate_detections_per_image_tpu(
- softmax_class_outputs[i], box_outputs[i], anchor_boxes[i],
+ softmax_class_outputs[i], anchor_boxes[i],
image_info[i], pre_nms_num_detections, post_nms_num_detections,
nms_threshold, bbox_reg_weights)
@@ -192,7 +192,6 @@ def generate_detections_tpu(class_outputs,
def generate_detections_gpu(class_outputs,
- box_outputs,
anchor_boxes,
image_info,
pre_nms_num_detections=1000,
@@ -232,22 +231,24 @@ def generate_detections_gpu(class_outputs,
# Remove background
scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
- boxes = tf.slice(
- tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
- [0, 0, 1, 0], [-1, -1, -1, -1])
+ # boxes = tf.slice(
+ # tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
+ # [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
tf.ones([1, 1, num_classes - 1, 1]))
num_detections = num_boxes * (num_classes - 1)
- boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
+ # boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
scores = tf.reshape(scores, [batch_size, num_detections, 1])
anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
# Decode
- boxes = box_utils.decode_boxes(
- boxes, anchor_boxes, bbox_reg_weights)
+ # boxes = box_utils.decode_boxes(
+ # boxes, anchor_boxes, bbox_reg_weights)
+
+ boxes = anchor_boxes
# Clip boxes
height = tf.expand_dims(image_info[:, 0:1], axis=-1)
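The hunk above bypasses bounding-box regression in postprocess_ops.py: the commented-out calls to box_utils.decode_boxes are what normally turn (dx, dy, dw, dh) offsets into final boxes, and the patched code now passes the anchor/ROI boxes through unchanged. As a reminder of what is being skipped, here is a rough single-box sketch of the usual Faster R-CNN decoding, assuming the (ymin, xmin, ymax, xmax) ordering and the default bbox_reg_weights; the exact box_utils implementation in this repo may differ in detail.

import math

def decode_box(delta, anchor, weights=(10., 10., 5., 5.)):
    # Deltas are divided by the regression weights, then applied to the
    # anchor's centre and size; zero deltas reproduce the anchor exactly.
    dx, dy, dw, dh = (d / w for d, w in zip(delta, weights))
    ymin, xmin, ymax, xmax = anchor
    ha, wa = ymax - ymin, xmax - xmin
    cy, cx = ymin + 0.5 * ha, xmin + 0.5 * wa
    cy, cx = cy + dy * ha, cx + dx * wa
    h, w = ha * math.exp(dh), wa * math.exp(dw)
    return (cy - 0.5 * h, cx - 0.5 * w, cy + 0.5 * h, cx + 0.5 * w)

# With zero deltas the decoded box equals the anchor, which is effectively
# what `boxes = anchor_boxes` in the patch produces for every detection.
print(decode_box((0., 0., 0., 0.), (10., 10., 50., 90.)))  # (10, 10, 50, 90)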
diff --git a/models/official/mask_rcnn/postprocess_ops_back.py b/models/official/mask_rcnn/postprocess_ops_back.py
new file mode 100644
index 000000000..ecabb8b08
--- /dev/null
+++ b/models/official/mask_rcnn/postprocess_ops_back.py
@@ -0,0 +1,278 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops used to post-process raw detections."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+import box_utils
+
+
+def generate_detections_per_image_tpu(cls_outputs,
+ box_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections per image given the model outputs.
+
+ Args:
+ cls_outputs: a tensor with shape [N, num_classes], which stacks class
+ logit outputs on all feature levels. The N is the number of total anchors
+ on all levels. The num_classes is the number of classes predicted by the
+ model. Note that the cls_outputs should be the output of softmax().
+ box_outputs: a tensor with shape [N, num_classes*4], which stacks box
+ regression outputs on all feature levels. The N is the number of total
+ anchors on all levels.
+ anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
+ feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [5] which encodes the input image's [height,
+ width, scale, original_height, original_width]
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ detections: Tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores
+ -- respectively.
+ """
+ num_boxes, num_classes = cls_outputs.get_shape().as_list()
+
+ # Remove background class scores.
+ cls_outputs = cls_outputs[:, 1:num_classes]
+ top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
+ tf.reshape(cls_outputs, [-1]),
+ k=pre_nms_num_detections,
+ sorted=False)
+ classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
+ top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)
+
+ anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
+ box_outputs = tf.reshape(
+ box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
+ class_indices = classes
+ box_outputs = tf.gather_nd(box_outputs,
+ tf.stack([top_k_indices, class_indices], axis=1))
+
+ # apply bounding box regression to anchors
+ boxes = box_utils.decode_boxes(
+ box_outputs, anchor_boxes, bbox_reg_weights)
+ boxes = box_utils.clip_boxes(
+ boxes, image_info[0], image_info[1])
+
+ list_of_all_boxes = []
+ list_of_all_scores = []
+ list_of_all_classes = []
+ # Skip background class.
+ for class_i in range(num_classes):
+ # Compute bitmask for the given classes.
+ class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
+ # This works because score is in [0, 1].
+ class_i_scores = top_k_scores * class_i_bitmask
+ # The TPU and CPU have different behaviors for
+ # tf.image.non_max_suppression_padded (b/116754376).
+ (class_i_post_nms_indices,
+ class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
+ tf.to_float(boxes),
+ tf.to_float(class_i_scores),
+ post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.05,
+ pad_to_max_output_size=True,
+ name='nms_detections_' + str(class_i))
+ class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
+ class_i_post_nms_scores = tf.gather(class_i_scores,
+ class_i_post_nms_indices)
+ mask = tf.less(tf.range(post_nms_num_detections), [class_i_nms_num_valid])
+ class_i_post_nms_scores = tf.where(
+ mask, class_i_post_nms_scores, tf.zeros_like(class_i_post_nms_scores))
+ class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores), class_i+1)
+ list_of_all_boxes.append(class_i_post_nms_boxes)
+ list_of_all_scores.append(class_i_post_nms_scores)
+ list_of_all_classes.append(class_i_classes)
+
+ post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
+ post_nms_scores = tf.concat(list_of_all_scores, axis=0)
+ post_nms_classes = tf.concat(list_of_all_classes, axis=0)
+
+ # sort all results.
+ post_nms_scores, sorted_indices = tf.nn.top_k(
+ tf.to_float(post_nms_scores),
+ k=post_nms_num_detections,
+ sorted=True)
+ post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
+ post_nms_classes = tf.gather(post_nms_classes, sorted_indices)
+
+ valid_mask = tf.where(
+ tf.greater(post_nms_scores, 0), tf.ones_like(post_nms_scores),
+ tf.zeros_like(post_nms_scores))
+ num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
+ box_classes = tf.to_float(post_nms_classes)
+ return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
+
+
+def generate_detections_tpu(class_outputs,
+ box_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (TPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
+ stacks box regression outputs on all feature levels. The N is the number
+ of total anchors on all levels.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, _, _ = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ num_valid_boxes, box_coordinates, box_classes, box_scores = ([], [], [], [])
+ for i in range(batch_size):
+ result = generate_detections_per_image_tpu(
+ softmax_class_outputs[i], box_outputs[i], anchor_boxes[i],
+ image_info[i], pre_nms_num_detections, post_nms_num_detections,
+ nms_threshold, bbox_reg_weights)
+
+ num_valid_boxes.append(result[0])
+ box_coordinates.append(result[1])
+ box_classes.append(result[2])
+ box_scores.append(result[3])
+
+ num_valid_boxes = tf.stack(num_valid_boxes)
+ box_coordinates = tf.stack(box_coordinates)
+ box_classes = tf.stack(box_classes)
+ box_scores = tf.stack(box_scores)
+ return num_valid_boxes, box_coordinates, box_classes, box_scores
+
+
+def generate_detections_gpu(class_outputs,
+ box_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (GPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
+ stacks box regression outputs on all feature levels. The N is the number
+ of total anchors on all levels.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ # Remove background
+ scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
+ boxes = tf.slice(
+ tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
+ [0, 0, 1, 0], [-1, -1, -1, -1])
+
+ anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
+ tf.ones([1, 1, num_classes - 1, 1]))
+
+ num_detections = num_boxes * (num_classes - 1)
+
+ boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
+ scores = tf.reshape(scores, [batch_size, num_detections, 1])
+ anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
+
+ # Decode
+ boxes = box_utils.decode_boxes(
+ boxes, anchor_boxes, bbox_reg_weights)
+
+ # Clip boxes
+ height = tf.expand_dims(image_info[:, 0:1], axis=-1)
+ width = tf.expand_dims(image_info[:, 1:2], axis=-1)
+ boxes = box_utils.clip_boxes(boxes, height, width)
+
+ # NMS
+ pre_nms_boxes = box_utils.to_normalized_coordinates(
+ boxes, height, width)
+ pre_nms_boxes = tf.reshape(
+ pre_nms_boxes, [batch_size, num_boxes, num_classes - 1, 4])
+ pre_nms_scores = tf.reshape(
+ scores, [batch_size, num_boxes, num_classes - 1])
+ (post_nms_boxes, post_nms_scores, post_nms_classes,
+ post_nms_num_valid_boxes) = (
+ tf.image.combined_non_max_suppression(
+ pre_nms_boxes,
+ pre_nms_scores,
+ max_output_size_per_class=pre_nms_num_detections,
+ max_total_size=post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.0,
+ pad_per_class=False))
+ post_nms_classes = post_nms_classes + 1
+ post_nms_boxes = box_utils.to_absolute_coordinates(
+ post_nms_boxes, height, width)
+ return (post_nms_num_valid_boxes, post_nms_boxes,
+ tf.to_float(post_nms_classes), post_nms_scores)
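One detail worth calling out in generate_detections_per_image_tpu (both the original above and the patched variant below) is the per-class NMS trick: because softmax scores lie in [0, 1], multiplying by a 0/1 class bitmask pushes every box of other classes to score 0, so a single padded NMS call per class can run over the full top-k candidate set. A toy illustration of just the masking step, in plain Python with hypothetical values:

def per_class_scores(scores, classes, class_i):
    # Boxes whose class differs from class_i are pushed to score 0.0, so NMS
    # and the subsequent score-threshold test ignore them.
    return [s if c == class_i else 0.0 for s, c in zip(scores, classes)]

print(per_class_scores([0.9, 0.7, 0.4], [0, 1, 0], 0))  # [0.9, 0.0, 0.4]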
diff --git a/models/official/mask_rcnn/postprocess_ops_new.py b/models/official/mask_rcnn/postprocess_ops_new.py
new file mode 100644
index 000000000..4a829ce68
--- /dev/null
+++ b/models/official/mask_rcnn/postprocess_ops_new.py
@@ -0,0 +1,279 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops used to post-process raw detections."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+import box_utils
+
+
+def generate_detections_per_image_tpu(cls_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections per image given the model outputs.
+
+ Args:
+ cls_outputs: a tensor with shape [N, num_classes], which stacks class
+ logit outputs on all feature levels. The N is the number of total anchors
+ on all levels. The num_classes is the number of classes predicted by the
+ model. Note that the cls_outputs should be the output of softmax().
+ anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
+ feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [5] which encodes the input image's [height,
+ width, scale, original_height, original_width]
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ detections: Tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores
+ -- respectively.
+ """
+ num_boxes, num_classes = cls_outputs.get_shape().as_list()
+
+ # Remove background class scores.
+ cls_outputs = cls_outputs[:, 1:num_classes]
+ top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
+ tf.reshape(cls_outputs, [-1]),
+ k=pre_nms_num_detections,
+ sorted=False)
+ classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
+ top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)
+
+ anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
+ boxes = anchor_boxes
+ # box_outputs = tf.reshape(
+ # box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
+ class_indices = classes
+ # box_outputs = tf.gather_nd(box_outputs,
+ # tf.stack([top_k_indices, class_indices], axis=1))
+
+ # apply bounding box regression to anchors
+ # boxes = box_utils.decode_boxes(
+ # box_outputs, anchor_boxes, bbox_reg_weights)
+ boxes = box_utils.clip_boxes(
+ boxes, image_info[0], image_info[1])
+
+
+ list_of_all_boxes = []
+ list_of_all_scores = []
+ list_of_all_classes = []
+ # Skip background class.
+ for class_i in range(num_classes):
+ # Compute bitmask for the given classes.
+ class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
+ # This works because score is in [0, 1].
+ class_i_scores = top_k_scores * class_i_bitmask
+ # The TPU and CPU have different behaviors for
+ # tf.image.non_max_suppression_padded (b/116754376).
+ (class_i_post_nms_indices,
+ class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
+ tf.to_float(boxes),
+ tf.to_float(class_i_scores),
+ post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.05,
+ pad_to_max_output_size=True,
+ name='nms_detections_' + str(class_i))
+ class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
+ class_i_post_nms_scores = tf.gather(class_i_scores,
+ class_i_post_nms_indices)
+ mask = tf.less(tf.range(post_nms_num_detections), [class_i_nms_num_valid])
+ class_i_post_nms_scores = tf.where(
+ mask, class_i_post_nms_scores, tf.zeros_like(class_i_post_nms_scores))
+ class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores), class_i+1)
+ list_of_all_boxes.append(class_i_post_nms_boxes)
+ list_of_all_scores.append(class_i_post_nms_scores)
+ list_of_all_classes.append(class_i_classes)
+
+ post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
+ post_nms_scores = tf.concat(list_of_all_scores, axis=0)
+ post_nms_classes = tf.concat(list_of_all_classes, axis=0)
+
+ # sort all results.
+ post_nms_scores, sorted_indices = tf.nn.top_k(
+ tf.to_float(post_nms_scores),
+ k=post_nms_num_detections,
+ sorted=True)
+ post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
+ post_nms_classes = tf.gather(post_nms_classes, sorted_indices)
+
+ valid_mask = tf.where(
+ tf.greater(post_nms_scores, 0), tf.ones_like(post_nms_scores),
+ tf.zeros_like(post_nms_scores))
+ num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
+ box_classes = tf.to_float(post_nms_classes)
+ return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
+
+
+def generate_detections_tpu(class_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (TPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, _, _ = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ num_valid_boxes, box_coordinates, box_classes, box_scores = ([], [], [], [])
+ for i in range(batch_size):
+ result = generate_detections_per_image_tpu(
+ softmax_class_outputs[i], anchor_boxes[i],
+ image_info[i], pre_nms_num_detections, post_nms_num_detections,
+ nms_threshold, bbox_reg_weights)
+
+ num_valid_boxes.append(result[0])
+ box_coordinates.append(result[1])
+ box_classes.append(result[2])
+ box_scores.append(result[3])
+
+ num_valid_boxes = tf.stack(num_valid_boxes)
+ box_coordinates = tf.stack(box_coordinates)
+ box_classes = tf.stack(box_classes)
+ box_scores = tf.stack(box_scores)
+ return num_valid_boxes, box_coordinates, box_classes, box_scores
+
+
+def generate_detections_gpu(class_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (GPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ # Remove background
+ scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
+ # boxes = tf.slice(
+ # tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
+ # [0, 0, 1, 0], [-1, -1, -1, -1])
+
+ anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
+ tf.ones([1, 1, num_classes - 1, 1]))
+
+ num_detections = num_boxes * (num_classes - 1)
+
+ # boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
+ scores = tf.reshape(scores, [batch_size, num_detections, 1])
+ anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
+
+ # Decode
+ # boxes = box_utils.decode_boxes(
+ # boxes, anchor_boxes, bbox_reg_weights)
+
+ boxes = anchor_boxes
+
+ # Clip boxes
+ height = tf.expand_dims(image_info[:, 0:1], axis=-1)
+ width = tf.expand_dims(image_info[:, 1:2], axis=-1)
+ boxes = box_utils.clip_boxes(boxes, height, width)
+
+ # NMS
+ pre_nms_boxes = box_utils.to_normalized_coordinates(
+ boxes, height, width)
+ pre_nms_boxes = tf.reshape(
+ pre_nms_boxes, [batch_size, num_boxes, num_classes - 1, 4])
+ pre_nms_scores = tf.reshape(
+ scores, [batch_size, num_boxes, num_classes - 1])
+ (post_nms_boxes, post_nms_scores, post_nms_classes,
+ post_nms_num_valid_boxes) = (
+ tf.image.combined_non_max_suppression(
+ pre_nms_boxes,
+ pre_nms_scores,
+ max_output_size_per_class=pre_nms_num_detections,
+ max_total_size=post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.0,
+ pad_per_class=False))
+ post_nms_classes = post_nms_classes + 1
+ post_nms_boxes = box_utils.to_absolute_coordinates(
+ post_nms_boxes, height, width)
+ return (post_nms_num_valid_boxes, post_nms_boxes,
+ tf.to_float(post_nms_classes), post_nms_scores)
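Finally, the GPU path reshapes its inputs so that every (box, foreground class) pair becomes one candidate detection before tf.image.combined_non_max_suppression. A small NumPy shape walk-through of that broadcast-and-flatten, using toy sizes (the real sizes come from class_outputs):

import numpy as np

batch_size, num_boxes, num_classes = 2, 5, 4     # 4 classes incl. background
anchor_boxes = np.zeros((batch_size, num_boxes, 4))

# Mirror of tf.expand_dims(anchor_boxes, 2) * tf.ones([1, 1, num_classes-1, 1]):
# tile each anchor once per foreground class.
tiled = anchor_boxes[:, :, np.newaxis, :] * np.ones((1, 1, num_classes - 1, 1))
print(tiled.shape)                               # (2, 5, 3, 4)

# Flatten to one candidate per (box, class) pair, as fed to NMS.
num_detections = num_boxes * (num_classes - 1)
print(tiled.reshape(batch_size, num_detections, 4).shape)   # (2, 15, 4)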