(This change also adds PyCharm IDE configuration files under models/official/mask_rcnn/.idea/: mask_rcnn.iml, misc.xml, modules.xml, vcs.xml and workspace.xml. Their XML content is not reproduced here.)
diff --git a/models/official/mask_rcnn/coco_metric.py b/models/official/mask_rcnn/coco_metric.py
index ea00a3255..47645fbdc 100644
--- a/models/official/mask_rcnn/coco_metric.py
+++ b/models/official/mask_rcnn/coco_metric.py
@@ -134,7 +134,8 @@ def load_predictions(self,
for i, image_id in enumerate(detection_results['source_id']):
if include_mask:
- box_coorindates_in_image = detection_results['detection_boxes'][i]
+ # box_coorindates_in_image = detection_results['detection_boxes'][i]
+ box_coorindates_in_image = detection_results['selected_box_rois'][i]
segments = generate_segmentation_from_masks(
detection_results['detection_masks'][i],
box_coorindates_in_image,
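
The rationale for switching to 'selected_box_rois' here appears to be that the
mask predictions are aligned to the ROI boxes fed to the mask head, so the
pasting step must use those boxes rather than the refined 'detection_boxes'.
Below is a minimal, illustrative numpy/cv2 sketch of that pasting step; the
helper name paste_instance_mask and the toy values are assumptions for
illustration, not part of this patch.

    # Sketch: paste one fixed-size mask prediction into an image canvas using
    # the box the mask was predicted against ([x, y, w, h] in image pixels).
    import numpy as np
    import cv2

    def paste_instance_mask(mask, box_xywh, image_height, image_width, thresh=0.5):
      x, y, w, h = [int(round(v)) for v in box_xywh]
      w, h = max(w, 1), max(h, 1)
      resized = cv2.resize(mask.astype(np.float32), (w, h))  # dsize is (width, height)
      canvas = np.zeros((image_height, image_width), dtype=np.uint8)
      x0, y0 = max(x, 0), max(y, 0)
      x1, y1 = min(x + w, image_width), min(y + h, image_height)
      canvas[y0:y1, x0:x1] = (
          resized[(y0 - y):(y1 - y), (x0 - x):(x1 - x)] > thresh).astype(np.uint8)
      return canvas

    # Example: a 28x28 mask predicted for an ROI at [x, y, w, h] = [40, 30, 64, 48].
    mask_28 = np.ones((28, 28), dtype=np.float32)
    full_mask = paste_instance_mask(mask_28, [40, 30, 64, 48], 200, 200)
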
diff --git a/models/official/mask_rcnn/coco_metric_back.py b/models/official/mask_rcnn/coco_metric_back.py
new file mode 100644
index 000000000..ea00a3255
--- /dev/null
+++ b/models/official/mask_rcnn/coco_metric_back.py
@@ -0,0 +1,339 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""COCO-style evaluation metrics.
+
+Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
+
+COCO API: github.com/cocodataset/cocoapi/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import atexit
+import copy
+import tempfile
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import pycocotools.mask as maskUtils
+import tensorflow.compat.v1 as tf
+import cv2
+
+
+class MaskCOCO(COCO):
+ """COCO object for mask evaluation.
+ """
+
+ def reset(self, dataset):
+ """Reset the dataset and groundtruth data index in this object.
+
+ Args:
+      dataset: dict of groundtruth data. It should have a similar structure to
+        the COCO groundtruth JSON file. It must contain three keys: {'images',
+        'annotations', 'categories'}.
+ 'images': list of image information dictionary. Required keys: 'id',
+ 'width' and 'height'.
+ 'annotations': list of dict. Bounding boxes and segmentations related
+ information. Required keys: {'id', 'image_id', 'category_id', 'bbox',
+ 'iscrowd', 'area', 'segmentation'}.
+ 'categories': list of dict of the category information.
+ Required key: 'id'.
+ Refer to http://cocodataset.org/#format-data for more details.
+
+ Raises:
+ AttributeError: If the dataset is empty or not a dict.
+ """
+ assert dataset, 'Groundtruth should not be empty.'
+ assert isinstance(dataset,
+ dict), 'annotation file format {} not supported'.format(
+ type(dataset))
+ self.anns, self.cats, self.imgs = dict(), dict(), dict()
+ self.dataset = copy.deepcopy(dataset)
+ self.createIndex()
+
+ def loadRes(self, detection_results, include_mask, is_image_mask=False):
+ """Load result file and return a result api object.
+
+ Args:
+      detection_results: a dictionary containing prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+ Returns:
+ res: result MaskCOCO api object
+ """
+ res = MaskCOCO()
+ res.dataset['images'] = [img for img in self.dataset['images']]
+ print('Loading and preparing results...')
+ predictions = self.load_predictions(
+ detection_results,
+ include_mask=include_mask,
+ is_image_mask=is_image_mask)
+    assert isinstance(predictions, list), 'results is not an array of objects'
+ if predictions:
+ image_ids = [pred['image_id'] for pred in predictions]
+ assert set(image_ids) == (set(image_ids) & set(self.getImgIds())), \
+ 'Results do not correspond to current coco set'
+
+ if (predictions and 'bbox' in predictions[0] and predictions[0]['bbox']):
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ bb = pred['bbox']
+ x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
+ if 'segmentation' not in pred:
+ pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
+ pred['area'] = bb[2] * bb[3]
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+ elif 'segmentation' in predictions[0]:
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ # now only support compressed RLE format as segmentation results
+ pred['area'] = maskUtils.area(pred['segmentation'])
+ if 'bbox' not in pred:
+ pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+
+ res.dataset['annotations'] = predictions
+
+ res.createIndex()
+ return res
+
+ def load_predictions(self,
+ detection_results,
+ include_mask,
+ is_image_mask=False):
+ """Create prediction dictionary list from detection and mask results.
+
+ Args:
+      detection_results: a dictionary containing numpy arrays that correspond
+        to prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+    Returns:
+      a list of dictionaries containing the prediction results from the model
+      in numpy form.
+ """
+ predictions = []
+ num_detections = detection_results['detection_scores'].size
+ current_index = 0
+ for i, image_id in enumerate(detection_results['source_id']):
+
+ if include_mask:
+ box_coorindates_in_image = detection_results['detection_boxes'][i]
+ segments = generate_segmentation_from_masks(
+ detection_results['detection_masks'][i],
+ box_coorindates_in_image,
+ int(detection_results['image_info'][i][3]),
+ int(detection_results['image_info'][i][4]),
+ is_image_mask=is_image_mask)
+
+ # Convert the mask to uint8 and then to fortranarray for RLE encoder.
+ encoded_masks = [
+ maskUtils.encode(np.asfortranarray(instance_mask.astype(np.uint8)))
+ for instance_mask in segments
+ ]
+
+ for box_index in range(int(detection_results['num_detections'][i])):
+ if current_index % 1000000 == 0:
+ print('{}/{}'.format(current_index, num_detections))
+ current_index += 1
+
+ prediction = {
+ 'image_id': int(image_id),
+ 'bbox': detection_results['detection_boxes'][i][box_index].tolist(),
+ 'score': detection_results['detection_scores'][i][box_index],
+ 'category_id': int(
+ detection_results['detection_classes'][i][box_index]),
+ }
+
+ if include_mask:
+ prediction['segmentation'] = encoded_masks[box_index]
+
+ predictions.append(prediction)
+
+ return predictions
+
+
+def generate_segmentation_from_masks(masks,
+ detected_boxes,
+ image_height,
+ image_width,
+ is_image_mask=False):
+ """Generates segmentation result from instance masks.
+
+ Args:
+ masks: a numpy array of shape [N, mask_height, mask_width] representing the
+ instance masks w.r.t. the `detected_boxes`.
+ detected_boxes: a numpy array of shape [N, 4] representing the reference
+ bounding boxes.
+ image_height: an integer representing the height of the image.
+ image_width: an integer representing the width of the image.
+ is_image_mask: bool. True: input masks are whole-image masks. False: input
+ masks are bounding-box level masks.
+
+ Returns:
+ segms: a numpy array of shape [N, image_height, image_width] representing
+ the instance masks *pasted* on the image canvas.
+ """
+
+ def expand_boxes(boxes, scale):
+ """Expands an array of boxes by a given scale."""
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
+ # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
+ # whereas `boxes` here is in [x1, y1, w, h] form
+ w_half = boxes[:, 2] * .5
+ h_half = boxes[:, 3] * .5
+ x_c = boxes[:, 0] + w_half
+ y_c = boxes[:, 1] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = np.zeros(boxes.shape)
+ boxes_exp[:, 0] = x_c - w_half
+ boxes_exp[:, 2] = x_c + w_half
+ boxes_exp[:, 1] = y_c - h_half
+ boxes_exp[:, 3] = y_c + h_half
+
+ return boxes_exp
+
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
+ # To work around an issue with cv2.resize (it seems to automatically pad
+ # with repeated border values), we manually zero-pad the masks by 1 pixel
+ # prior to resizing back to the original image resolution. This prevents
+ # "top hat" artifacts. We therefore need to expand the reference boxes by an
+ # appropriate factor.
+ _, mask_height, mask_width = masks.shape
+ scale = max((mask_width + 2.0) / mask_width,
+ (mask_height + 2.0) / mask_height)
+
+ ref_boxes = expand_boxes(detected_boxes, scale)
+ ref_boxes = ref_boxes.astype(np.int32)
+ padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
+ segms = []
+ for mask_ind, mask in enumerate(masks):
+ im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
+ if is_image_mask:
+ # Process whole-image masks.
+ im_mask[:, :] = mask[:, :]
+ else:
+ # Process mask inside bounding boxes.
+ padded_mask[1:-1, 1:-1] = mask[:, :]
+
+ ref_box = ref_boxes[mask_ind, :]
+ w = ref_box[2] - ref_box[0] + 1
+ h = ref_box[3] - ref_box[1] + 1
+ w = np.maximum(w, 1)
+ h = np.maximum(h, 1)
+
+ mask = cv2.resize(padded_mask, (w, h))
+ mask = np.array(mask > 0.5, dtype=np.uint8)
+
+ x_0 = max(ref_box[0], 0)
+ x_1 = min(ref_box[2] + 1, image_width)
+ y_0 = max(ref_box[1], 0)
+ y_1 = min(ref_box[3] + 1, image_height)
+
+ im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]), (
+ x_0 - ref_box[0]):(x_1 - ref_box[0])]
+ segms.append(im_mask)
+
+ segms = np.array(segms)
+ assert masks.shape[0] == segms.shape[0]
+ return segms
+
+
+class EvaluationMetric(object):
+ """COCO evaluation metric class."""
+
+ def __init__(self, filename, include_mask):
+ """Constructs COCO evaluation class.
+
+ The class provides the interface to metrics_fn in TPUEstimator. The
+ _evaluate() loads a JSON file in COCO annotation format as the
+ groundtruths and runs COCO evaluation.
+
+ Args:
+ filename: Ground truth JSON file name. If filename is None, use
+ groundtruth data passed from the dataloader for evaluation.
+ include_mask: boolean to indicate whether or not to include mask eval.
+ """
+ if filename:
+ if filename.startswith('gs://'):
+ _, local_val_json = tempfile.mkstemp(suffix='.json')
+ tf.gfile.Remove(local_val_json)
+
+ tf.gfile.Copy(filename, local_val_json)
+ atexit.register(tf.gfile.Remove, local_val_json)
+ else:
+ local_val_json = filename
+ self.coco_gt = MaskCOCO(local_val_json)
+ self.filename = filename
+ self.metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+ 'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+ self._include_mask = include_mask
+ if self._include_mask:
+ mask_metric_names = ['mask_' + x for x in self.metric_names]
+ self.metric_names.extend(mask_metric_names)
+
+ self._reset()
+
+ def _reset(self):
+ """Reset COCO API object."""
+ if self.filename is None and not hasattr(self, 'coco_gt'):
+ self.coco_gt = MaskCOCO()
+
+ def predict_metric_fn(self,
+ predictions,
+ is_predict_image_mask=False,
+ groundtruth_data=None):
+ """Generates COCO metrics."""
+ image_ids = list(set(predictions['source_id']))
+ if groundtruth_data is not None:
+ self.coco_gt.reset(groundtruth_data)
+ coco_dt = self.coco_gt.loadRes(
+ predictions, self._include_mask, is_image_mask=is_predict_image_mask)
+ coco_eval = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
+ coco_eval.params.imgIds = image_ids
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+ coco_metrics = coco_eval.stats
+
+ if self._include_mask:
+ # Create another object for instance segmentation metric evaluation.
+ mcoco_eval = COCOeval(self.coco_gt, coco_dt, iouType='segm')
+ mcoco_eval.params.imgIds = image_ids
+ mcoco_eval.evaluate()
+ mcoco_eval.accumulate()
+ mcoco_eval.summarize()
+ mask_coco_metrics = mcoco_eval.stats
+
+ if self._include_mask:
+ metrics = np.hstack((coco_metrics, mask_coco_metrics))
+ else:
+ metrics = coco_metrics
+
+ # clean up after evaluation is done.
+ self._reset()
+ metrics = metrics.astype(np.float32)
+
+ metrics_dict = {}
+ for i, name in enumerate(self.metric_names):
+ metrics_dict[name] = metrics[i]
+ return metrics_dict
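
For reference, a hedged usage sketch of the EvaluationMetric class defined
above. The prediction keys and shapes follow load_predictions(); the
annotation filename, image id and array values are placeholders, so the
numbers below are dummies rather than a real evaluation.

    import numpy as np
    import coco_metric

    # Assumes a local COCO-format annotation file and that image id 42 exists in it.
    metric = coco_metric.EvaluationMetric(
        filename='instances_val2017.json', include_mask=False)

    predictions = {
        'source_id': np.array([42]),                   # one image
        'num_detections': np.array([1]),
        'detection_boxes': np.full((1, 1, 4), 10.0),   # [x, y, w, h] after eval processing
        'detection_scores': np.array([[0.9]]),
        'detection_classes': np.array([[1]]),
    }
    results = metric.predict_metric_fn(predictions)    # dict of AP/AR values keyed by name
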
diff --git a/models/official/mask_rcnn/coco_metric_new.py b/models/official/mask_rcnn/coco_metric_new.py
new file mode 100644
index 000000000..47645fbdc
--- /dev/null
+++ b/models/official/mask_rcnn/coco_metric_new.py
@@ -0,0 +1,340 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""COCO-style evaluation metrics.
+
+Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
+
+COCO API: github.com/cocodataset/cocoapi/
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import atexit
+import copy
+import tempfile
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import pycocotools.mask as maskUtils
+import tensorflow.compat.v1 as tf
+import cv2
+
+
+class MaskCOCO(COCO):
+ """COCO object for mask evaluation.
+ """
+
+ def reset(self, dataset):
+ """Reset the dataset and groundtruth data index in this object.
+
+ Args:
+      dataset: dict of groundtruth data. It should have a similar structure to
+        the COCO groundtruth JSON file. It must contain three keys: {'images',
+        'annotations', 'categories'}.
+ 'images': list of image information dictionary. Required keys: 'id',
+ 'width' and 'height'.
+ 'annotations': list of dict. Bounding boxes and segmentations related
+ information. Required keys: {'id', 'image_id', 'category_id', 'bbox',
+ 'iscrowd', 'area', 'segmentation'}.
+ 'categories': list of dict of the category information.
+ Required key: 'id'.
+ Refer to http://cocodataset.org/#format-data for more details.
+
+ Raises:
+ AttributeError: If the dataset is empty or not a dict.
+ """
+ assert dataset, 'Groundtruth should not be empty.'
+ assert isinstance(dataset,
+ dict), 'annotation file format {} not supported'.format(
+ type(dataset))
+ self.anns, self.cats, self.imgs = dict(), dict(), dict()
+ self.dataset = copy.deepcopy(dataset)
+ self.createIndex()
+
+ def loadRes(self, detection_results, include_mask, is_image_mask=False):
+ """Load result file and return a result api object.
+
+ Args:
+      detection_results: a dictionary containing prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+ Returns:
+ res: result MaskCOCO api object
+ """
+ res = MaskCOCO()
+ res.dataset['images'] = [img for img in self.dataset['images']]
+ print('Loading and preparing results...')
+ predictions = self.load_predictions(
+ detection_results,
+ include_mask=include_mask,
+ is_image_mask=is_image_mask)
+    assert isinstance(predictions, list), 'results is not an array of objects'
+ if predictions:
+ image_ids = [pred['image_id'] for pred in predictions]
+ assert set(image_ids) == (set(image_ids) & set(self.getImgIds())), \
+ 'Results do not correspond to current coco set'
+
+ if (predictions and 'bbox' in predictions[0] and predictions[0]['bbox']):
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ bb = pred['bbox']
+ x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
+ if 'segmentation' not in pred:
+ pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
+ pred['area'] = bb[2] * bb[3]
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+ elif 'segmentation' in predictions[0]:
+ res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
+ for idx, pred in enumerate(predictions):
+ # now only support compressed RLE format as segmentation results
+ pred['area'] = maskUtils.area(pred['segmentation'])
+ if 'bbox' not in pred:
+ pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
+ pred['id'] = idx + 1
+ pred['iscrowd'] = 0
+
+ res.dataset['annotations'] = predictions
+
+ res.createIndex()
+ return res
+
+ def load_predictions(self,
+ detection_results,
+ include_mask,
+ is_image_mask=False):
+ """Create prediction dictionary list from detection and mask results.
+
+ Args:
+      detection_results: a dictionary containing numpy arrays that correspond
+        to prediction results.
+      include_mask: a boolean, whether to include mask in detection results.
+      is_image_mask: a boolean indicating whether the predicted mask is a whole-image mask.
+
+    Returns:
+      a list of dictionaries containing the prediction results from the model
+      in numpy form.
+ """
+ predictions = []
+ num_detections = detection_results['detection_scores'].size
+ current_index = 0
+ for i, image_id in enumerate(detection_results['source_id']):
+
+ if include_mask:
+ # box_coorindates_in_image = detection_results['detection_boxes'][i]
+ box_coorindates_in_image = detection_results['selected_box_rois'][i]
+ segments = generate_segmentation_from_masks(
+ detection_results['detection_masks'][i],
+ box_coorindates_in_image,
+ int(detection_results['image_info'][i][3]),
+ int(detection_results['image_info'][i][4]),
+ is_image_mask=is_image_mask)
+
+ # Convert the mask to uint8 and then to fortranarray for RLE encoder.
+ encoded_masks = [
+ maskUtils.encode(np.asfortranarray(instance_mask.astype(np.uint8)))
+ for instance_mask in segments
+ ]
+
+ for box_index in range(int(detection_results['num_detections'][i])):
+ if current_index % 1000000 == 0:
+ print('{}/{}'.format(current_index, num_detections))
+ current_index += 1
+
+ prediction = {
+ 'image_id': int(image_id),
+ 'bbox': detection_results['detection_boxes'][i][box_index].tolist(),
+ 'score': detection_results['detection_scores'][i][box_index],
+ 'category_id': int(
+ detection_results['detection_classes'][i][box_index]),
+ }
+
+ if include_mask:
+ prediction['segmentation'] = encoded_masks[box_index]
+
+ predictions.append(prediction)
+
+ return predictions
+
+
+def generate_segmentation_from_masks(masks,
+ detected_boxes,
+ image_height,
+ image_width,
+ is_image_mask=False):
+ """Generates segmentation result from instance masks.
+
+ Args:
+ masks: a numpy array of shape [N, mask_height, mask_width] representing the
+ instance masks w.r.t. the `detected_boxes`.
+ detected_boxes: a numpy array of shape [N, 4] representing the reference
+ bounding boxes.
+ image_height: an integer representing the height of the image.
+ image_width: an integer representing the width of the image.
+ is_image_mask: bool. True: input masks are whole-image masks. False: input
+ masks are bounding-box level masks.
+
+ Returns:
+ segms: a numpy array of shape [N, image_height, image_width] representing
+ the instance masks *pasted* on the image canvas.
+ """
+
+ def expand_boxes(boxes, scale):
+ """Expands an array of boxes by a given scale."""
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227 # pylint: disable=line-too-long
+ # The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
+ # whereas `boxes` here is in [x1, y1, w, h] form
+ w_half = boxes[:, 2] * .5
+ h_half = boxes[:, 3] * .5
+ x_c = boxes[:, 0] + w_half
+ y_c = boxes[:, 1] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = np.zeros(boxes.shape)
+ boxes_exp[:, 0] = x_c - w_half
+ boxes_exp[:, 2] = x_c + w_half
+ boxes_exp[:, 1] = y_c - h_half
+ boxes_exp[:, 3] = y_c + h_half
+
+ return boxes_exp
+
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812 # pylint: disable=line-too-long
+ # To work around an issue with cv2.resize (it seems to automatically pad
+ # with repeated border values), we manually zero-pad the masks by 1 pixel
+ # prior to resizing back to the original image resolution. This prevents
+ # "top hat" artifacts. We therefore need to expand the reference boxes by an
+ # appropriate factor.
+ _, mask_height, mask_width = masks.shape
+ scale = max((mask_width + 2.0) / mask_width,
+ (mask_height + 2.0) / mask_height)
+
+ ref_boxes = expand_boxes(detected_boxes, scale)
+ ref_boxes = ref_boxes.astype(np.int32)
+ padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
+ segms = []
+ for mask_ind, mask in enumerate(masks):
+ im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
+ if is_image_mask:
+ # Process whole-image masks.
+ im_mask[:, :] = mask[:, :]
+ else:
+ # Process mask inside bounding boxes.
+ padded_mask[1:-1, 1:-1] = mask[:, :]
+
+ ref_box = ref_boxes[mask_ind, :]
+ w = ref_box[2] - ref_box[0] + 1
+ h = ref_box[3] - ref_box[1] + 1
+ w = np.maximum(w, 1)
+ h = np.maximum(h, 1)
+
+ mask = cv2.resize(padded_mask, (w, h))
+ mask = np.array(mask > 0.5, dtype=np.uint8)
+
+ x_0 = max(ref_box[0], 0)
+ x_1 = min(ref_box[2] + 1, image_width)
+ y_0 = max(ref_box[1], 0)
+ y_1 = min(ref_box[3] + 1, image_height)
+
+ im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]), (
+ x_0 - ref_box[0]):(x_1 - ref_box[0])]
+ segms.append(im_mask)
+
+ segms = np.array(segms)
+ assert masks.shape[0] == segms.shape[0]
+ return segms
+
+
+class EvaluationMetric(object):
+ """COCO evaluation metric class."""
+
+ def __init__(self, filename, include_mask):
+ """Constructs COCO evaluation class.
+
+ The class provides the interface to metrics_fn in TPUEstimator. The
+ _evaluate() loads a JSON file in COCO annotation format as the
+ groundtruths and runs COCO evaluation.
+
+ Args:
+ filename: Ground truth JSON file name. If filename is None, use
+ groundtruth data passed from the dataloader for evaluation.
+ include_mask: boolean to indicate whether or not to include mask eval.
+ """
+ if filename:
+ if filename.startswith('gs://'):
+ _, local_val_json = tempfile.mkstemp(suffix='.json')
+ tf.gfile.Remove(local_val_json)
+
+ tf.gfile.Copy(filename, local_val_json)
+ atexit.register(tf.gfile.Remove, local_val_json)
+ else:
+ local_val_json = filename
+ self.coco_gt = MaskCOCO(local_val_json)
+ self.filename = filename
+ self.metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+ 'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+ self._include_mask = include_mask
+ if self._include_mask:
+ mask_metric_names = ['mask_' + x for x in self.metric_names]
+ self.metric_names.extend(mask_metric_names)
+
+ self._reset()
+
+ def _reset(self):
+ """Reset COCO API object."""
+ if self.filename is None and not hasattr(self, 'coco_gt'):
+ self.coco_gt = MaskCOCO()
+
+ def predict_metric_fn(self,
+ predictions,
+ is_predict_image_mask=False,
+ groundtruth_data=None):
+ """Generates COCO metrics."""
+ image_ids = list(set(predictions['source_id']))
+ if groundtruth_data is not None:
+ self.coco_gt.reset(groundtruth_data)
+ coco_dt = self.coco_gt.loadRes(
+ predictions, self._include_mask, is_image_mask=is_predict_image_mask)
+ coco_eval = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
+ coco_eval.params.imgIds = image_ids
+ coco_eval.evaluate()
+ coco_eval.accumulate()
+ coco_eval.summarize()
+ coco_metrics = coco_eval.stats
+
+ if self._include_mask:
+ # Create another object for instance segmentation metric evaluation.
+ mcoco_eval = COCOeval(self.coco_gt, coco_dt, iouType='segm')
+ mcoco_eval.params.imgIds = image_ids
+ mcoco_eval.evaluate()
+ mcoco_eval.accumulate()
+ mcoco_eval.summarize()
+ mask_coco_metrics = mcoco_eval.stats
+
+ if self._include_mask:
+ metrics = np.hstack((coco_metrics, mask_coco_metrics))
+ else:
+ metrics = coco_metrics
+
+ # clean up after evaluation is done.
+ self._reset()
+ metrics = metrics.astype(np.float32)
+
+ metrics_dict = {}
+ for i, name in enumerate(self.metric_names):
+ metrics_dict[name] = metrics[i]
+ return metrics_dict
diff --git a/models/official/mask_rcnn/evaluation.py b/models/official/mask_rcnn/evaluation.py
index 09a971da3..608b41e5d 100644
--- a/models/official/mask_rcnn/evaluation.py
+++ b/models/official/mask_rcnn/evaluation.py
@@ -45,6 +45,21 @@ def process_prediction_for_eval(prediction):
new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
processed_box_coordinates[image_id, box_id, :] = new_box
prediction['detection_boxes'] = processed_box_coordinates
+
+  # Apply the same conversion and rescaling to the ROI boxes used for mask pasting.
+ box_coordinates_rois = prediction['selected_box_rois']
+ box_coordinates_rois_coordinates = np.zeros_like(box_coordinates_rois)
+
+ for image_id in range(box_coordinates_rois.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates_rois.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates_rois[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ box_coordinates_rois_coordinates[image_id, box_id, :] = new_box
+ prediction['selected_box_rois'] = box_coordinates_rois_coordinates
+
return prediction
@@ -208,6 +223,7 @@ def write_image_summary(predictions, summary_writer, current_step):
len(predictions['detection_boxes'][i]),
int(predictions['num_detections'][i]))
detection_boxes = predictions['detection_boxes'][i][:num_detections]
+ selected_box_rois = predictions['selected_box_rois'][i][:num_detections]
detection_scores = predictions['detection_scores'][i][:num_detections]
detection_classes = predictions['detection_classes'][i][:num_detections]
@@ -221,6 +237,10 @@ def write_image_summary(predictions, summary_writer, current_step):
detection_boxes = detection_boxes * np.array(
[image_width, image_height, image_width, image_height])
+ selected_box_rois = selected_box_rois / np.array([w, h, w, h])
+ selected_box_rois = selected_box_rois * np.array(
+ [image_width, image_height, image_width, image_height])
+
gt_boxes = None
if 'groundtruth_boxes' in predictions:
gt_boxes = predictions['groundtruth_boxes'][i]
@@ -231,7 +251,7 @@ def write_image_summary(predictions, summary_writer, current_step):
if include_mask:
instance_masks = predictions['detection_masks'][i][0:num_detections]
segmentations = coco_metric.generate_segmentation_from_masks(
- instance_masks, detection_boxes, image_height, image_width)
+ instance_masks, selected_box_rois, image_height, image_width)
# From [x, y, w, h] to [x1, y1, x2, y2] and
# process_prediction_for_eval() set the box to be [x, y] format, need to
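
For clarity, an equivalent vectorized form (sketch only) of the per-box loop
this hunk adds; rois_to_coco_xywh is an illustrative helper name, not part of
the patch.

    import numpy as np

    def rois_to_coco_xywh(rois, image_info):
      # rois: [batch, num_boxes, 4] as [y1, x1, y2, x2] in padded-image coordinates.
      # image_info: [batch, >=3], with the per-image rescaling factor at index 2.
      y1, x1, y2, x2 = np.split(rois, 4, axis=-1)
      xywh = np.concatenate([x1, y1, x2 - x1, y2 - y1], axis=-1)
      scale = image_info[:, 2].reshape(-1, 1, 1)
      return xywh * scale

    rois = np.array([[[10.0, 20.0, 50.0, 80.0]]])   # one box, [y1, x1, y2, x2]
    info = np.array([[1024.0, 1024.0, 2.0]])        # scale = 2.0
    print(rois_to_coco_xywh(rois, info))            # [[[40. 20. 120. 80.]]]
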
diff --git a/models/official/mask_rcnn/evaluation_back.py b/models/official/mask_rcnn/evaluation_back.py
new file mode 100644
index 000000000..09a971da3
--- /dev/null
+++ b/models/official/mask_rcnn/evaluation_back.py
@@ -0,0 +1,253 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to perform COCO evaluation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+from absl import logging
+import numpy as np
+from PIL import Image
+import six
+import tensorflow.compat.v1 as tf
+
+import coco_metric
+import coco_utils
+from object_detection import visualization_utils
+
+
+def process_prediction_for_eval(prediction):
+ """Process the model prediction for COCO eval."""
+ image_info = prediction['image_info']
+ box_coordinates = prediction['detection_boxes']
+ processed_box_coordinates = np.zeros_like(box_coordinates)
+
+ for image_id in range(box_coordinates.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ processed_box_coordinates[image_id, box_id, :] = new_box
+ prediction['detection_boxes'] = processed_box_coordinates
+ return prediction
+
+
+def compute_coco_eval_metric(predictor,
+ num_batches=-1,
+ include_mask=True,
+ annotation_json_file=None):
+ """Compute COCO eval metric given a prediction generator.
+
+ Args:
+ predictor: a generator that iteratively pops a dictionary of predictions
+ with the format compatible with COCO eval tool.
+ num_batches: the number of batches to be aggregated in eval. This is how
+ many times that the predictor gets pulled.
+ include_mask: a boolean that indicates whether we include the mask eval.
+ annotation_json_file: the annotation json file of the eval dataset.
+
+ Returns:
+ eval_results: the aggregated COCO metric eval results.
+ """
+ del num_batches
+
+ if not annotation_json_file:
+ annotation_json_file = None
+ use_groundtruth_from_json = (annotation_json_file is not None)
+
+ batch_idx = 0
+ predictions = dict()
+ while True:
+ try:
+ prediction = six.next(predictor)
+ logging.info('Running inference on batch %d...', (batch_idx + 1))
+ except StopIteration:
+ logging.info('Finished the eval set at %d batch.', (batch_idx + 1))
+ break
+
+ prediction = process_prediction_for_eval(prediction)
+ for k, v in six.iteritems(prediction):
+ if k not in predictions:
+ predictions[k] = [v]
+ else:
+ predictions[k].append(v)
+
+ batch_idx = batch_idx + 1
+
+ for k, v in six.iteritems(predictions):
+ predictions[k] = np.concatenate(predictions[k], axis=0)
+
+ if 'orig_images' in predictions and predictions['orig_images'].shape[0] > 10:
+ # Only samples a few images for visualization.
+ predictions['orig_images'] = predictions['orig_images'][:10]
+
+ if use_groundtruth_from_json:
+ eval_metric = coco_metric.EvaluationMetric(
+ annotation_json_file, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(predictions)
+ else:
+ images, annotations = coco_utils.extract_coco_groundtruth(
+ predictions, include_mask)
+ dataset = coco_utils.create_coco_format_dataset(images, annotations)
+ eval_metric = coco_metric.EvaluationMetric(
+ filename=None, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(
+ predictions, groundtruth_data=dataset)
+ logging.info('Eval results: %s', eval_results)
+ return eval_results, predictions
+
+
+def evaluate(eval_estimator,
+ input_fn,
+ num_eval_samples,
+ eval_batch_size,
+ include_mask=True,
+ validation_json_file=None):
+ """Runs COCO evaluation once."""
+ predictor = eval_estimator.predict(
+ input_fn=input_fn, yield_single_examples=False)
+ # Every predictor.next() gets a batch of prediction (a dictionary).
+ num_eval_times = num_eval_samples // eval_batch_size
+  assert num_eval_times > 0, 'num_eval_samples must be >= eval_batch_size!'
+ eval_results, predictions = compute_coco_eval_metric(predictor,
+ num_eval_times,
+ include_mask,
+ validation_json_file)
+ return eval_results, predictions
+
+
+def write_summary(eval_results, summary_writer, current_step, predictions=None):
+ """Write out eval results for the checkpoint."""
+ with tf.Graph().as_default():
+ summaries = []
+ for metric in eval_results:
+ summaries.append(
+ tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
+ write_image_summary(predictions, summary_writer, current_step)
+
+
+def create_image_summary(image,
+ boxes,
+ scores,
+ classes,
+ gt_boxes=None,
+ segmentations=None):
+ """Creates an image summary given predictions."""
+ max_boxes_to_draw = 100
+ min_score_thresh = 0.1
+
+  # Visualizes the predictions.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image,
+ boxes,
+ classes=classes,
+ scores=scores,
+ category_index={},
+ instance_masks=segmentations,
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ min_score_thresh=min_score_thresh,
+ agnostic_mode=False)
+ if gt_boxes is not None:
+ # Visualizes the groundtruth boxes. They are in black by default.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image_with_detections,
+ gt_boxes,
+ classes=None,
+ scores=None,
+ category_index={},
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ agnostic_mode=True)
+ buf = io.BytesIO()
+ w, h = image_with_detections.shape[:2]
+ ratio = 1024 / w
+ new_size = [int(w * ratio), int(h * ratio)]
+ image = Image.fromarray(image_with_detections.astype(np.uint8))
+ image.thumbnail(new_size)
+ image.save(buf, format='png')
+ image_summary = tf.Summary.Image(encoded_image_string=buf.getvalue())
+ return image_summary
+
+
+def write_image_summary(predictions, summary_writer, current_step):
+ """Write out image and prediction for summary."""
+ if not predictions or not isinstance(predictions, dict):
+ return
+ if 'orig_images' not in predictions:
+ logging.info('Missing orig_images in predictions: %s',
+ predictions.keys())
+ return
+ predictions['orig_images'] = predictions['orig_images'] * 255
+ predictions['orig_images'] = predictions['orig_images'].astype(np.uint8)
+ num_images = predictions['orig_images'].shape[0]
+ include_mask = ('detection_masks' in predictions)
+
+ with tf.Graph().as_default():
+ summaries = []
+ for i in xrange(num_images):
+ num_detections = min(
+ len(predictions['detection_boxes'][i]),
+ int(predictions['num_detections'][i]))
+ detection_boxes = predictions['detection_boxes'][i][:num_detections]
+ detection_scores = predictions['detection_scores'][i][:num_detections]
+ detection_classes = predictions['detection_classes'][i][:num_detections]
+
+ image = predictions['orig_images'][i]
+ image_height = image.shape[0]
+ image_width = image.shape[1]
+
+ # Rescale the box to fit the visualization image.
+ h, w = predictions['image_info'][i][3:5]
+ detection_boxes = detection_boxes / np.array([w, h, w, h])
+ detection_boxes = detection_boxes * np.array(
+ [image_width, image_height, image_width, image_height])
+
+ gt_boxes = None
+ if 'groundtruth_boxes' in predictions:
+ gt_boxes = predictions['groundtruth_boxes'][i]
+ gt_boxes = gt_boxes * np.array(
+ [image_height, image_width, image_height, image_width])
+
+ segmentations = None
+ if include_mask:
+ instance_masks = predictions['detection_masks'][i][0:num_detections]
+ segmentations = coco_metric.generate_segmentation_from_masks(
+ instance_masks, detection_boxes, image_height, image_width)
+
+      # Convert from [x, y, w, h] to [x1, y1, x2, y2]. Note that
+      # process_prediction_for_eval() put the boxes in [x, y] order, so they
+      # need to be reverted to [y, x] order for visualization.
+ xmin, ymin, w, h = np.split(detection_boxes, 4, axis=-1)
+ xmax = xmin + w
+ ymax = ymin + h
+ boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax], axis=-1)
+ image_summary = create_image_summary(
+ image,
+ boxes=boxes_to_visualize,
+ scores=detection_scores,
+ classes=detection_classes.astype(np.int32),
+ gt_boxes=gt_boxes,
+ segmentations=segmentations)
+ image_value = tf.Summary.Value(tag='%d_input' % i, image=image_summary)
+ summaries.append(image_value)
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
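
As a standalone sketch, the aggregation pattern used by
compute_coco_eval_metric() above: per-batch prediction dicts are collected
into lists per key and concatenated along the batch axis before a single COCO
evaluation pass. The batch contents here are dummies.

    import numpy as np

    batches = [
        {'source_id': np.array([1, 2]), 'detection_scores': np.zeros((2, 100))},
        {'source_id': np.array([3, 4]), 'detection_scores': np.zeros((2, 100))},
    ]
    aggregated = {}
    for batch in batches:
      for key, value in batch.items():
        aggregated.setdefault(key, []).append(value)
    aggregated = {k: np.concatenate(v, axis=0) for k, v in aggregated.items()}
    assert aggregated['source_id'].shape == (4,)
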
diff --git a/models/official/mask_rcnn/evaluation_new.py b/models/official/mask_rcnn/evaluation_new.py
new file mode 100644
index 000000000..608b41e5d
--- /dev/null
+++ b/models/official/mask_rcnn/evaluation_new.py
@@ -0,0 +1,273 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions to perform COCO evaluation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import io
+from absl import logging
+import numpy as np
+from PIL import Image
+import six
+import tensorflow.compat.v1 as tf
+
+import coco_metric
+import coco_utils
+from object_detection import visualization_utils
+
+
+def process_prediction_for_eval(prediction):
+ """Process the model prediction for COCO eval."""
+ image_info = prediction['image_info']
+ box_coordinates = prediction['detection_boxes']
+ processed_box_coordinates = np.zeros_like(box_coordinates)
+
+ for image_id in range(box_coordinates.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ processed_box_coordinates[image_id, box_id, :] = new_box
+ prediction['detection_boxes'] = processed_box_coordinates
+
+  # Apply the same conversion and rescaling to the ROI boxes used for mask pasting.
+ box_coordinates_rois = prediction['selected_box_rois']
+ box_coordinates_rois_coordinates = np.zeros_like(box_coordinates_rois)
+
+ for image_id in range(box_coordinates_rois.shape[0]):
+ scale = image_info[image_id][2]
+ for box_id in range(box_coordinates_rois.shape[1]):
+ # Map [y1, x1, y2, x2] -> [x1, y1, w, h] and multiply detections
+ # by image scale.
+ y1, x1, y2, x2 = box_coordinates_rois[image_id, box_id, :]
+ new_box = scale * np.array([x1, y1, x2 - x1, y2 - y1])
+ box_coordinates_rois_coordinates[image_id, box_id, :] = new_box
+ prediction['selected_box_rois'] = box_coordinates_rois_coordinates
+
+ return prediction
+
+
+def compute_coco_eval_metric(predictor,
+ num_batches=-1,
+ include_mask=True,
+ annotation_json_file=None):
+ """Compute COCO eval metric given a prediction generator.
+
+ Args:
+ predictor: a generator that iteratively pops a dictionary of predictions
+ with the format compatible with COCO eval tool.
+ num_batches: the number of batches to be aggregated in eval. This is how
+ many times that the predictor gets pulled.
+ include_mask: a boolean that indicates whether we include the mask eval.
+ annotation_json_file: the annotation json file of the eval dataset.
+
+ Returns:
+ eval_results: the aggregated COCO metric eval results.
+ """
+ del num_batches
+
+ if not annotation_json_file:
+ annotation_json_file = None
+ use_groundtruth_from_json = (annotation_json_file is not None)
+
+ batch_idx = 0
+ predictions = dict()
+ while True:
+ try:
+ prediction = six.next(predictor)
+ logging.info('Running inference on batch %d...', (batch_idx + 1))
+ except StopIteration:
+ logging.info('Finished the eval set at %d batch.', (batch_idx + 1))
+ break
+
+ prediction = process_prediction_for_eval(prediction)
+ for k, v in six.iteritems(prediction):
+ if k not in predictions:
+ predictions[k] = [v]
+ else:
+ predictions[k].append(v)
+
+ batch_idx = batch_idx + 1
+
+ for k, v in six.iteritems(predictions):
+ predictions[k] = np.concatenate(predictions[k], axis=0)
+
+ if 'orig_images' in predictions and predictions['orig_images'].shape[0] > 10:
+ # Only samples a few images for visualization.
+ predictions['orig_images'] = predictions['orig_images'][:10]
+
+ if use_groundtruth_from_json:
+ eval_metric = coco_metric.EvaluationMetric(
+ annotation_json_file, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(predictions)
+ else:
+ images, annotations = coco_utils.extract_coco_groundtruth(
+ predictions, include_mask)
+ dataset = coco_utils.create_coco_format_dataset(images, annotations)
+ eval_metric = coco_metric.EvaluationMetric(
+ filename=None, include_mask=include_mask)
+ eval_results = eval_metric.predict_metric_fn(
+ predictions, groundtruth_data=dataset)
+ logging.info('Eval results: %s', eval_results)
+ return eval_results, predictions
+
+
+def evaluate(eval_estimator,
+ input_fn,
+ num_eval_samples,
+ eval_batch_size,
+ include_mask=True,
+ validation_json_file=None):
+ """Runs COCO evaluation once."""
+ predictor = eval_estimator.predict(
+ input_fn=input_fn, yield_single_examples=False)
+ # Every predictor.next() gets a batch of prediction (a dictionary).
+ num_eval_times = num_eval_samples // eval_batch_size
+  assert num_eval_times > 0, 'num_eval_samples must be >= eval_batch_size!'
+ eval_results, predictions = compute_coco_eval_metric(predictor,
+ num_eval_times,
+ include_mask,
+ validation_json_file)
+ return eval_results, predictions
+
+
+def write_summary(eval_results, summary_writer, current_step, predictions=None):
+ """Write out eval results for the checkpoint."""
+ with tf.Graph().as_default():
+ summaries = []
+ for metric in eval_results:
+ summaries.append(
+ tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
+ write_image_summary(predictions, summary_writer, current_step)
+
+
+def create_image_summary(image,
+ boxes,
+ scores,
+ classes,
+ gt_boxes=None,
+ segmentations=None):
+ """Creates an image summary given predictions."""
+ max_boxes_to_draw = 100
+ min_score_thresh = 0.1
+
+  # Visualizes the predictions.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image,
+ boxes,
+ classes=classes,
+ scores=scores,
+ category_index={},
+ instance_masks=segmentations,
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ min_score_thresh=min_score_thresh,
+ agnostic_mode=False)
+ if gt_boxes is not None:
+ # Visualizes the groundtruth boxes. They are in black by default.
+ image_with_detections = visualization_utils.visualize_boxes_and_labels_on_image_array(
+ image_with_detections,
+ gt_boxes,
+ classes=None,
+ scores=None,
+ category_index={},
+ use_normalized_coordinates=False,
+ max_boxes_to_draw=max_boxes_to_draw,
+ agnostic_mode=True)
+ buf = io.BytesIO()
+ w, h = image_with_detections.shape[:2]
+ ratio = 1024 / w
+ new_size = [int(w * ratio), int(h * ratio)]
+ image = Image.fromarray(image_with_detections.astype(np.uint8))
+ image.thumbnail(new_size)
+ image.save(buf, format='png')
+ image_summary = tf.Summary.Image(encoded_image_string=buf.getvalue())
+ return image_summary
+
+
+def write_image_summary(predictions, summary_writer, current_step):
+ """Write out image and prediction for summary."""
+ if not predictions or not isinstance(predictions, dict):
+ return
+ if 'orig_images' not in predictions:
+ logging.info('Missing orig_images in predictions: %s',
+ predictions.keys())
+ return
+ predictions['orig_images'] = predictions['orig_images'] * 255
+ predictions['orig_images'] = predictions['orig_images'].astype(np.uint8)
+ num_images = predictions['orig_images'].shape[0]
+ include_mask = ('detection_masks' in predictions)
+
+ with tf.Graph().as_default():
+ summaries = []
+ for i in xrange(num_images):
+ num_detections = min(
+ len(predictions['detection_boxes'][i]),
+ int(predictions['num_detections'][i]))
+ detection_boxes = predictions['detection_boxes'][i][:num_detections]
+ selected_box_rois = predictions['selected_box_rois'][i][:num_detections]
+ detection_scores = predictions['detection_scores'][i][:num_detections]
+ detection_classes = predictions['detection_classes'][i][:num_detections]
+
+ image = predictions['orig_images'][i]
+ image_height = image.shape[0]
+ image_width = image.shape[1]
+
+ # Rescale the box to fit the visualization image.
+ h, w = predictions['image_info'][i][3:5]
+ detection_boxes = detection_boxes / np.array([w, h, w, h])
+ detection_boxes = detection_boxes * np.array(
+ [image_width, image_height, image_width, image_height])
+
+ selected_box_rois = selected_box_rois / np.array([w, h, w, h])
+ selected_box_rois = selected_box_rois * np.array(
+ [image_width, image_height, image_width, image_height])
+
+ gt_boxes = None
+ if 'groundtruth_boxes' in predictions:
+ gt_boxes = predictions['groundtruth_boxes'][i]
+ gt_boxes = gt_boxes * np.array(
+ [image_height, image_width, image_height, image_width])
+
+ segmentations = None
+ if include_mask:
+ instance_masks = predictions['detection_masks'][i][0:num_detections]
+ segmentations = coco_metric.generate_segmentation_from_masks(
+ instance_masks, selected_box_rois, image_height, image_width)
+
+      # Convert from [x, y, w, h] to [x1, y1, x2, y2]. Note that
+      # process_prediction_for_eval() put the boxes in [x, y] order, so they
+      # need to be reverted to [y, x] order for visualization.
+ xmin, ymin, w, h = np.split(detection_boxes, 4, axis=-1)
+ xmax = xmin + w
+ ymax = ymin + h
+ boxes_to_visualize = np.concatenate([ymin, xmin, ymax, xmax], axis=-1)
+ image_summary = create_image_summary(
+ image,
+ boxes=boxes_to_visualize,
+ scores=detection_scores,
+ classes=detection_classes.astype(np.int32),
+ gt_boxes=gt_boxes,
+ segmentations=segmentations)
+ image_value = tf.Summary.Value(tag='%d_input' % i, image=image_summary)
+ summaries.append(image_value)
+ tf_summary = tf.Summary(value=list(summaries))
+ summary_writer.add_summary(tf_summary, current_step)
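
The box rescaling used for visualization in write_image_summary() above,
restated as a small standalone sketch: boxes are normalized by the processed
image size stored in image_info[i][3:5] (as [h, w]) and then scaled to the
original image size being drawn. The helper name and toy sizes are
illustrative.

    import numpy as np

    def rescale_for_visualization(boxes_xywh, processed_hw, orig_hw):
      h, w = processed_hw
      image_height, image_width = orig_hw
      boxes = boxes_xywh / np.array([w, h, w, h])
      return boxes * np.array([image_width, image_height, image_width, image_height])

    boxes = np.array([[10.0, 20.0, 30.0, 40.0]])                 # [x, y, w, h]
    vis_boxes = rescale_for_visualization(boxes, (800, 800), (480, 640))
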
diff --git a/models/official/mask_rcnn/losses.py b/models/official/mask_rcnn/losses.py
index e6815ad7c..99652a75b 100644
--- a/models/official/mask_rcnn/losses.py
+++ b/models/official/mask_rcnn/losses.py
@@ -133,7 +133,30 @@ def _fast_rcnn_box_loss(box_outputs, box_targets, class_targets, normalizer=1.0,
return box_loss
-def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets,
+def gIoU_loss(boxes1, boxes2, weight):
+  """Computes a generalized IoU (GIoU) loss; `weight` > 0 marks valid boxes."""
+  boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
+  boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
+  left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
+  right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
+
+  inter_section = tf.maximum(right_down - left_up, 0.0)
+  inter_area = inter_section[..., 0] * inter_section[..., 1]
+  union_area = boxes1_area + boxes2_area - inter_area
+  iou = inter_area / union_area
+
+  # GIoU subtracts the fraction of the smallest enclosing box that is not
+  # covered by the union of the two boxes.
+  enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
+  enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
+  enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
+  enclose_area = enclose[..., 0] * enclose[..., 1]
+  giou = iou - 1.0 * (enclose_area - union_area) / enclose_area
+  # Only positive (foreground) samples contribute to the loss.
+  mask = tf.cast(tf.greater(weight, 0), tf.float32)
+  mask = tf.stop_gradient(mask)
+
+  loss = (1 - giou) * mask
+  loss = tf.reduce_mean(loss)
+  return loss
+
+def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets, selected_class_targets,
params):
"""Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.
@@ -173,29 +196,10 @@ class and box losses from all levels.
class_loss = _fast_rcnn_class_loss(
class_outputs, class_targets_one_hot)
- # Selects the box from `box_outputs` based on `class_targets`, with which
- # the box has the maximum overlap.
- batch_size, num_rois, _ = box_outputs.get_shape().as_list()
- box_outputs = tf.reshape(box_outputs,
- [batch_size, num_rois, params['num_classes'], 4])
-
- box_indices = tf.reshape(
- class_targets + tf.tile(
- tf.expand_dims(
- tf.range(batch_size) * num_rois * params['num_classes'], 1),
- [1, num_rois]) + tf.tile(
- tf.expand_dims(tf.range(num_rois) * params['num_classes'], 0),
- [batch_size, 1]), [-1])
-
- box_outputs = tf.matmul(
- tf.one_hot(
- box_indices,
- batch_size * num_rois * params['num_classes'],
- dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
- box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
-
- box_loss = (params['fast_rcnn_box_loss_weight'] *
- _fast_rcnn_box_loss(box_outputs, box_targets, class_targets))
+
+    box_loss = (0.1 * params['fast_rcnn_box_loss_weight'] *
+                _fast_rcnn_box_loss(box_outputs, box_targets, selected_class_targets))
+    # box_loss = params['fast_rcnn_box_loss_weight'] * gIoU_loss(
+    #     box_outputs, box_targets, selected_class_targets)
+
total_loss = class_loss + box_loss
return total_loss, class_loss, box_loss
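
A quick numeric sanity check (plain numpy) of the GIoU math introduced in
gIoU_loss() above, using one illustrative pair of boxes in [x1, y1, x2, y2]
form. For A = [0, 0, 2, 2] and B = [1, 1, 3, 3]: IoU = 1/7, union U = 7,
enclosing-box area C = 9, so GIoU = IoU - (C - U)/C = 1/7 - 2/9 ~= -0.079 and
the per-box loss is 1 - GIoU ~= 1.079.

    import numpy as np

    a = np.array([0.0, 0.0, 2.0, 2.0])
    b = np.array([1.0, 1.0, 3.0, 3.0])
    inter = np.prod(np.maximum(np.minimum(a[2:], b[2:]) - np.maximum(a[:2], b[:2]), 0.0))
    union = np.prod(a[2:] - a[:2]) + np.prod(b[2:] - b[:2]) - inter
    enclose = np.prod(np.maximum(a[2:], b[2:]) - np.minimum(a[:2], b[:2]))
    giou = inter / union - (enclose - union) / enclose
    print(1.0 - giou)  # ~= 1.079
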
diff --git a/models/official/mask_rcnn/losses_back.py b/models/official/mask_rcnn/losses_back.py
new file mode 100644
index 000000000..e6815ad7c
--- /dev/null
+++ b/models/official/mask_rcnn/losses_back.py
@@ -0,0 +1,239 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Losses used for Mask-RCNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+
+def _rpn_score_loss(score_outputs, score_targets, normalizer=1.0):
+ """Computes score loss."""
+ # score_targets has three values: (1) score_targets[i]=1, the anchor is a
+ # positive sample. (2) score_targets[i]=0, negative. (3) score_targets[i]=-1,
+ # the anchor is don't care (ignore).
+ with tf.name_scope('rpn_score_loss'):
+ mask = tf.logical_or(tf.equal(score_targets, 1), tf.equal(score_targets, 0))
+ score_targets = tf.maximum(score_targets, tf.zeros_like(score_targets))
+ # RPN score loss is sum over all except ignored samples.
+ score_loss = tf.losses.sigmoid_cross_entropy(
+ score_targets, score_outputs, weights=mask,
+ reduction=tf.losses.Reduction.SUM)
+ score_loss /= normalizer
+ return score_loss
+
+
+def _rpn_box_loss(box_outputs, box_targets, normalizer=1.0, delta=1./9):
+ """Computes box regression loss."""
+  # delta is typically around the mean value of the regression target.
+  # For instance, the regression targets of a 512x512 input with 6 anchors on
+  # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('rpn_box_loss'):
+ mask = tf.not_equal(box_targets, 0.0)
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def rpn_loss(score_outputs, box_outputs, labels, params):
+ """Computes total RPN detection loss.
+
+ Computes total RPN detection loss including box and score from all levels.
+ Args:
+    score_outputs: an OrderedDict with keys representing levels and values
+      representing scores in [batch_size, height, width, num_anchors].
+    box_outputs: an OrderedDict with keys representing levels and values
+      representing box regression targets in
+      [batch_size, height, width, num_anchors * 4].
+    labels: the dictionary returned from the dataloader that includes
+      groundtruth targets.
+    params: the dictionary including training parameters specified in the
+      default_hparams function in this file.
+ Returns:
+ total_rpn_loss: a float tensor representing total loss reduced from
+ score and box losses from all levels.
+ rpn_score_loss: a float tensor representing total score loss.
+ rpn_box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('rpn_loss'):
+ levels = score_outputs.keys()
+
+ score_losses = []
+ box_losses = []
+ for level in levels:
+ score_targets_at_level = labels['score_targets_%d' % level]
+ box_targets_at_level = labels['box_targets_%d' % level]
+ score_losses.append(
+ _rpn_score_loss(
+ score_outputs[level],
+ score_targets_at_level,
+ normalizer=tf.to_float(
+ params['batch_size'] * params['rpn_batch_size_per_im'])))
+ box_losses.append(
+ _rpn_box_loss(box_outputs[level], box_targets_at_level))
+
+ # Sum per level losses to total loss.
+ rpn_score_loss = tf.add_n(score_losses)
+ rpn_box_loss = params['rpn_box_loss_weight'] * tf.add_n(box_losses)
+ total_rpn_loss = rpn_score_loss + rpn_box_loss
+ return total_rpn_loss, rpn_score_loss, rpn_box_loss
+
+
+def _fast_rcnn_class_loss(class_outputs, class_targets_one_hot, normalizer=1.0):
+ """Computes classification loss."""
+ with tf.name_scope('fast_rcnn_class_loss'):
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ class_loss = tf.losses.softmax_cross_entropy(
+ class_targets_one_hot, class_outputs,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ class_loss /= normalizer
+ return class_loss
+
+
+def _fast_rcnn_box_loss(box_outputs, box_targets, class_targets, normalizer=1.0,
+ delta=1.):
+ """Computes box regression loss."""
+  # delta is typically around the mean value of the regression target.
+  # For instance, the regression targets of a 512x512 input with 6 anchors on
+  # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('fast_rcnn_box_loss'):
+ mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
+ [1, 1, 4])
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets,
+ params):
+ """Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.
+
+ This function implements the classification and box regression loss of the
+ Fast-RCNN branch in Mask-RCNN. As the `box_outputs` produces `num_classes`
+ boxes for each RoI, the reference model expands `box_targets` to match the
+ shape of `box_outputs` and selects only the target that the RoI has a maximum
+ overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
+ Instead, this function selects the `box_outputs` by the `class_targets` so
+ that it doesn't expand `box_targets`.
+
+ The loss computation has two parts: (1) classification loss is softmax on all
+ RoIs. (2) box loss is smooth L1-loss on only positive samples of RoIs.
+ Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
+
+
+ Args:
+ class_outputs: a float tensor representing the class prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes].
+ box_outputs: a float tensor representing the box prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes * 4].
+ class_targets: a float tensor representing the class label for each box
+ with a shape of [batch_size, num_boxes].
+ box_targets: a float tensor representing the box label for each box
+ with a shape of [batch_size, num_boxes, 4].
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ total_loss: a float tensor representing the total loss reduced from
+ class and box losses from all levels.
+ cls_loss: a float tensor representing total class loss.
+ box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('fast_rcnn_loss'):
+ class_targets = tf.to_int32(class_targets)
+ class_targets_one_hot = tf.one_hot(class_targets, params['num_classes'])
+ class_loss = _fast_rcnn_class_loss(
+ class_outputs, class_targets_one_hot)
+
+ # Selects the box from `box_outputs` based on `class_targets`, with which
+ # the box has the maximum overlap.
+ batch_size, num_rois, _ = box_outputs.get_shape().as_list()
+ box_outputs = tf.reshape(box_outputs,
+ [batch_size, num_rois, params['num_classes'], 4])
+
+ box_indices = tf.reshape(
+ class_targets + tf.tile(
+ tf.expand_dims(
+ tf.range(batch_size) * num_rois * params['num_classes'], 1),
+ [1, num_rois]) + tf.tile(
+ tf.expand_dims(tf.range(num_rois) * params['num_classes'], 0),
+ [batch_size, 1]), [-1])
+
+ box_outputs = tf.matmul(
+ tf.one_hot(
+ box_indices,
+ batch_size * num_rois * params['num_classes'],
+ dtype=box_outputs.dtype), tf.reshape(box_outputs, [-1, 4]))
+ box_outputs = tf.reshape(box_outputs, [batch_size, -1, 4])
+
+ box_loss = (params['fast_rcnn_box_loss_weight'] *
+ _fast_rcnn_box_loss(box_outputs, box_targets, class_targets))
+ total_loss = class_loss + box_loss
+ return total_loss, class_loss, box_loss
+
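The class-indexed selection above flattens (batch, RoI, class) into one index and
uses a one-hot matmul, which avoids a gather op. A minimal standalone sketch of
the equivalent selection with `tf.gather_nd` (shapes and names here are
illustrative only):

import tensorflow.compat.v1 as tf

def select_box_by_class(box_outputs, class_targets, num_classes):
  # box_outputs: [B, R, num_classes * 4]; class_targets: int32 [B, R].
  batch_size, num_rois, _ = box_outputs.get_shape().as_list()
  boxes = tf.reshape(box_outputs, [batch_size, num_rois, num_classes, 4])
  batch_idx = tf.tile(
      tf.reshape(tf.range(batch_size), [batch_size, 1]), [1, num_rois])
  roi_idx = tf.tile(
      tf.reshape(tf.range(num_rois), [1, num_rois]), [batch_size, 1])
  indices = tf.stack([batch_idx, roi_idx, class_targets], axis=-1)
  # One [4] box per RoI, picked by its target class.
  return tf.gather_nd(boxes, indices)  # [B, R, 4]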
+
+def mask_rcnn_loss(mask_outputs, mask_targets, select_class_targets, params):
+ """Computes the mask loss of Mask-RCNN.
+
+ This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
+ produces `num_classes` masks for each RoI, the reference model expands
+ `mask_targets` to match the shape of `mask_outputs` and selects only the
+ target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
+ Instead, this implementation selects the `mask_outputs` by the `class_targets`
+ so that it doesn't expand `mask_targets`. Note that the selection logic is
+ done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
+
+ Args:
+ mask_outputs: a float tensor representing the prediction for each mask,
+ with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ mask_targets: a float tensor representing the binary mask of ground truth
+ labels for each mask with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ select_class_targets: a tensor with a shape of [batch_size, num_masks],
+ representing the foreground mask targets.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ mask_loss: a float tensor representing total mask loss.
+ """
+ with tf.name_scope('mask_rcnn_loss'):
+ (batch_size, num_masks, mask_height,
+ mask_width) = mask_outputs.get_shape().as_list()
+
+ weights = tf.tile(
+ tf.reshape(tf.greater(select_class_targets, 0),
+ [batch_size, num_masks, 1, 1]),
+ [1, 1, mask_height, mask_width])
+ loss = tf.losses.sigmoid_cross_entropy(
+ mask_targets, mask_outputs, weights=weights,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ return params['mrcnn_weight_loss_mask'] * loss
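The mask loss only counts pixels that belong to foreground RoIs: the per-RoI
class target is broadcast to a per-pixel weight map. A small sketch of that
weighting (values are illustrative only):

import tensorflow.compat.v1 as tf

# One image, two sampled 2x2 masks; the second RoI is background (class 0).
select_class_targets = tf.constant([[7, 0]])  # [batch=1, num_masks=2]
weights = tf.tile(
    tf.reshape(tf.greater(select_class_targets, 0), [1, 2, 1, 1]),
    [1, 1, 2, 2])
# weights[0, 0] is all True and weights[0, 1] is all False, so only the
# foreground mask contributes to the sigmoid cross entropy.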
diff --git a/models/official/mask_rcnn/losses_new.py b/models/official/mask_rcnn/losses_new.py
new file mode 100644
index 000000000..99652a75b
--- /dev/null
+++ b/models/official/mask_rcnn/losses_new.py
@@ -0,0 +1,243 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Losses used for Mask-RCNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+
+def _rpn_score_loss(score_outputs, score_targets, normalizer=1.0):
+ """Computes score loss."""
+ # score_targets has three values: (1) score_targets[i]=1, the anchor is a
+ # positive sample. (2) score_targets[i]=0, negative. (3) score_targets[i]=-1,
+ # the anchor is don't care (ignore).
+ with tf.name_scope('rpn_score_loss'):
+ mask = tf.logical_or(tf.equal(score_targets, 1), tf.equal(score_targets, 0))
+ score_targets = tf.maximum(score_targets, tf.zeros_like(score_targets))
+ # RPN score loss is sum over all except ignored samples.
+ score_loss = tf.losses.sigmoid_cross_entropy(
+ score_targets, score_outputs, weights=mask,
+ reduction=tf.losses.Reduction.SUM)
+ score_loss /= normalizer
+ return score_loss
+
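A small sketch of the {1, 0, -1} score-target convention (values are
illustrative only):

import tensorflow.compat.v1 as tf

score_targets = tf.constant([[1, 0, -1]])
mask = tf.logical_or(tf.equal(score_targets, 1), tf.equal(score_targets, 0))
# mask == [[True, True, False]]: the "don't care" anchor gets zero weight.
labels = tf.maximum(score_targets, tf.zeros_like(score_targets))
# labels == [[1, 0, 0]]: the -1 entry is clamped but already masked out.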
+
+def _rpn_box_loss(box_outputs, box_targets, normalizer=1.0, delta=1./9):
+ """Computes box regression loss."""
+ # delta is typically around the mean value of the regression target.
+ # for instance, the regression targets of a 512x512 input with 6 anchors on
+ # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('rpn_box_loss'):
+ mask = tf.not_equal(box_targets, 0.0)
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def rpn_loss(score_outputs, box_outputs, labels, params):
+ """Computes total RPN detection loss.
+
+ Computes total RPN detection loss including box and score from all levels.
+ Args:
+ score_outputs: an OrderedDict with keys representing levels and values
+ representing scores in [batch_size, height, width, num_anchors].
+ box_outputs: an OrderedDict with keys representing levels and values
+ representing box regression targets in
+ [batch_size, height, width, num_anchors * 4].
+ labels: the dictionary returned from the dataloader that includes
+ groundtruth targets.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ total_rpn_loss: a float tensor representing total loss reduced from
+ score and box losses from all levels.
+ rpn_score_loss: a float tensor representing total score loss.
+ rpn_box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('rpn_loss'):
+ levels = score_outputs.keys()
+
+ score_losses = []
+ box_losses = []
+ for level in levels:
+ score_targets_at_level = labels['score_targets_%d' % level]
+ box_targets_at_level = labels['box_targets_%d' % level]
+ score_losses.append(
+ _rpn_score_loss(
+ score_outputs[level],
+ score_targets_at_level,
+ normalizer=tf.to_float(
+ params['batch_size'] * params['rpn_batch_size_per_im'])))
+ box_losses.append(
+ _rpn_box_loss(box_outputs[level], box_targets_at_level))
+
+ # Sum per level losses to total loss.
+ rpn_score_loss = tf.add_n(score_losses)
+ rpn_box_loss = params['rpn_box_loss_weight'] * tf.add_n(box_losses)
+ total_rpn_loss = rpn_score_loss + rpn_box_loss
+ return total_rpn_loss, rpn_score_loss, rpn_box_loss
+
+
+def _fast_rcnn_class_loss(class_outputs, class_targets_one_hot, normalizer=1.0):
+ """Computes classification loss."""
+ with tf.name_scope('fast_rcnn_class_loss'):
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ class_loss = tf.losses.softmax_cross_entropy(
+ class_targets_one_hot, class_outputs,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ class_loss /= normalizer
+ return class_loss
+
+
+def _fast_rcnn_box_loss(box_outputs, box_targets, class_targets, normalizer=1.0,
+ delta=1.):
+ """Computes box regression loss."""
+ # delta is typically around the mean value of the regression target.
+ # for instance, the regression targets of a 512x512 input with 6 anchors on
+ # the P2-P6 pyramid are about [0.1, 0.1, 0.2, 0.2].
+ with tf.name_scope('fast_rcnn_box_loss'):
+ mask = tf.tile(tf.expand_dims(tf.greater(class_targets, 0), axis=2),
+ [1, 1, 4])
+ # The loss is normalized by the sum of non-zero weights before additional
+ # normalizer provided by the function caller.
+ box_loss = tf.losses.huber_loss(
+ box_targets,
+ box_outputs,
+ weights=mask,
+ delta=delta,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ box_loss /= normalizer
+ return box_loss
+
+
+def gIoU_loss(boxes1, boxes2, weight):
+ """Computes the generalized IoU (GIoU) loss between two sets of boxes.
+
+ Boxes are in corner form ([ymin, xmin, ymax, xmax] in this model). `weight`
+ marks foreground samples; only entries with weight > 0 contribute.
+ Reference: https://arxiv.org/abs/1902.09630
+ """
+ boxes1_area = (boxes1[..., 2] - boxes1[..., 0]) * (boxes1[..., 3] - boxes1[..., 1])
+ boxes2_area = (boxes2[..., 2] - boxes2[..., 0]) * (boxes2[..., 3] - boxes2[..., 1])
+ left_up = tf.maximum(boxes1[..., :2], boxes2[..., :2])
+ right_down = tf.minimum(boxes1[..., 2:], boxes2[..., 2:])
+
+ inter_section = tf.maximum(right_down - left_up, 0.0)
+ inter_area = inter_section[..., 0] * inter_section[..., 1]
+ union_area = boxes1_area + boxes2_area - inter_area
+ iou = inter_area / union_area
+
+ # Smallest axis-aligned box enclosing both boxes.
+ enclose_left_up = tf.minimum(boxes1[..., :2], boxes2[..., :2])
+ enclose_right_down = tf.maximum(boxes1[..., 2:], boxes2[..., 2:])
+ enclose = tf.maximum(enclose_right_down - enclose_left_up, 0.0)
+ enclose_area = enclose[..., 0] * enclose[..., 1]
+ giou = iou - 1.0 * (enclose_area - union_area) / enclose_area
+ # Only foreground samples contribute to the loss.
+ mask = tf.cast(tf.greater(weight, 0), tf.float32)
+ mask = tf.stop_gradient(mask)
+
+ loss = (1 - giou) * mask
+ loss = tf.reduce_mean(loss)
+ return loss
+
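A worked example of the GIoU term, assuming [ymin, xmin, ymax, xmax] boxes
(the numbers are illustrative only):

# boxes1 = [0, 0, 2, 2] and boxes2 = [1, 1, 3, 3], each with area 4.
# intersection = [1, 1, 2, 2] with area 1; union = 4 + 4 - 1 = 7; IoU = 1/7.
# smallest enclosing box = [0, 0, 3, 3] with area 9.
# GIoU = 1/7 - (9 - 7)/9 ~= 0.143 - 0.222 = -0.079, so (1 - GIoU) ~= 1.079
# for a foreground sample (weight > 0) and 0 for a background sample.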
+def fast_rcnn_loss(class_outputs, box_outputs, class_targets, box_targets, selected_class_targets,
+ params):
+ """Computes the box and class loss (Fast-RCNN branch) of Mask-RCNN.
+
+ This function implements the classification and box regression loss of the
+ Fast-RCNN branch in Mask-RCNN. As the `box_outputs` produces `num_classes`
+ boxes for each RoI, the reference model expands `box_targets` to match the
+ shape of `box_outputs` and selects only the target that the RoI has a maximum
+ overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/fast_rcnn.py) # pylint: disable=line-too-long
+ Instead, this function selects the `box_outputs` by the `class_targets` so
+ that it doesn't expand `box_targets`.
+
+ The loss computation has two parts: (1) classification loss is softmax on all
+ RoIs. (2) box loss is smooth L1-loss on only positive samples of RoIs.
+ Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/fast_rcnn_heads.py # pylint: disable=line-too-long
+
+
+ Args:
+ class_outputs: a float tensor representing the class prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes].
+ box_outputs: a float tensor representing the box prediction for each box
+ with a shape of [batch_size, num_boxes, num_classes * 4].
+ class_targets: a float tensor representing the class label for each box
+ with a shape of [batch_size, num_boxes].
+ box_targets: a float tensor representing the box label for each box
+ with a shape of [batch_size, num_boxes, 4].
+ selected_class_targets: a tensor with a shape of [batch_size, num_boxes]
+ used to restrict the box regression loss to foreground samples.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ total_loss: a float tensor representing the total loss reduced from
+ class and box losses from all levels.
+ cls_loss: a float tensor representing total class loss.
+ box_loss: a float tensor representing total box regression loss.
+ """
+ with tf.name_scope('fast_rcnn_loss'):
+ class_targets = tf.to_int32(class_targets)
+ class_targets_one_hot = tf.one_hot(class_targets, params['num_classes'])
+ class_loss = _fast_rcnn_class_loss(
+ class_outputs, class_targets_one_hot)
+
+
+ box_loss = (0.1 * params['fast_rcnn_box_loss_weight'] *
+ _fast_rcnn_box_loss(box_outputs, box_targets, selected_class_targets))
+ # box_loss = params['fast_rcnn_box_loss_weight'] * gIoU_loss(box_outputs,box_targets,selected_class_targets)
+
+ total_loss = class_loss + box_loss
+ return total_loss, class_loss, box_loss
+
+
+def mask_rcnn_loss(mask_outputs, mask_targets, select_class_targets, params):
+ """Computes the mask loss of Mask-RCNN.
+
+ This function implements the mask loss of Mask-RCNN. As the `mask_outputs`
+ produces `num_classes` masks for each RoI, the reference model expands
+ `mask_targets` to match the shape of `mask_outputs` and selects only the
+ target that the RoI has a maximum overlap. (Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/roi_data/mask_rcnn.py) # pylint: disable=line-too-long
+ Instead, this implementation selects the `mask_outputs` by the `class_targets`
+ so that it doesn't expand `mask_targets`. Note that the selection logic is
+ done in the post-processing of mask_rcnn_fn in mask_rcnn_architecture.py.
+
+ Args:
+ mask_outputs: a float tensor representing the prediction for each mask,
+ with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ mask_targets: a float tensor representing the binary mask of ground truth
+ labels for each mask with a shape of
+ [batch_size, num_masks, mask_height, mask_width].
+ select_class_targets: a tensor with a shape of [batch_size, num_masks],
+ representing the foreground mask targets.
+ params: the dictionary including training parameters specified in
+ the default_hparams function in this file.
+ Returns:
+ mask_loss: a float tensor representing total mask loss.
+ """
+ with tf.name_scope('mask_rcnn_loss'):
+ (batch_size, num_masks, mask_height,
+ mask_width) = mask_outputs.get_shape().as_list()
+
+ weights = tf.tile(
+ tf.reshape(tf.greater(select_class_targets, 0),
+ [batch_size, num_masks, 1, 1]),
+ [1, 1, mask_height, mask_width])
+ loss = tf.losses.sigmoid_cross_entropy(
+ mask_targets, mask_outputs, weights=weights,
+ reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS)
+ return params['mrcnn_weight_loss_mask'] * loss
diff --git a/models/official/mask_rcnn/mask_rcnn_model.py b/models/official/mask_rcnn/mask_rcnn_model.py
index 3c221169e..f6517f4f9 100644
--- a/models/official/mask_rcnn/mask_rcnn_model.py
+++ b/models/official/mask_rcnn/mask_rcnn_model.py
@@ -44,642 +44,755 @@
import spatial_transform_ops
import training_ops
import sys
+
sys.path.append('tpu/models/official/mnasnet')
-import mnasnet_models
+
+
+# import mnasnet_models  # disabled; re-enable before using the 'mnasnet' backbone below
def create_optimizer(learning_rate, params):
- """Creates optimized based on the specified flags."""
- if params['optimizer'] == 'momentum':
- optimizer = tf.train.MomentumOptimizer(
- learning_rate, momentum=params['momentum'])
- elif params['optimizer'] == 'adam':
- optimizer = tf.train.AdamOptimizer(learning_rate)
- elif params['optimizer'] == 'adadelta':
- optimizer = tf.train.AdadeltaOptimizer(learning_rate)
- elif params['optimizer'] == 'adagrad':
- optimizer = tf.train.AdagradOptimizer(learning_rate)
- elif params['optimizer'] == 'rmsprop':
- optimizer = tf.train.RMSPropOptimizer(
- learning_rate, momentum=params['momentum'])
- elif params['optimizer'] == 'lars':
- try:
- from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
-
- optimizer = LARSOptimizer(
- learning_rate,
- momentum=params['momentum'],
- weight_decay=params['lars_weight_decay'],
- skip_list=['batch_normalization', 'bias'])
- except ImportError as e:
- logging.exception('LARSOptimizer is currently not supported '
- 'in TensorFlow 2.x.')
- raise e
-
- else:
- raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
- return optimizer
+ """Creates optimized based on the specified flags."""
+ if params['optimizer'] == 'momentum':
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'adam':
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ elif params['optimizer'] == 'adadelta':
+ optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+ elif params['optimizer'] == 'adagrad':
+ optimizer = tf.train.AdagradOptimizer(learning_rate)
+ elif params['optimizer'] == 'rmsprop':
+ optimizer = tf.train.RMSPropOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'lars':
+ try:
+ from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
+
+ optimizer = LARSOptimizer(
+ learning_rate,
+ momentum=params['momentum'],
+ weight_decay=params['lars_weight_decay'],
+ skip_list=['batch_normalization', 'bias'])
+ except ImportError as e:
+ logging.exception('LARSOptimizer is currently not supported '
+ 'in TensorFlow 2.x.')
+ raise e
+
+ else:
+ raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
+ return optimizer
def remove_variables(variables, prefix):
- """Removes low-level variables from the input.
+ """Removes low-level variables from the input.
- Removing low-level parameters (e.g., initial convolution layer) from training
- usually leads to higher training speed and slightly better testing accuracy.
- The intuition is that the low-level architecture (e.g., ResNet-50) is able to
- capture low-level features such as edges; therefore, it does not need to be
- fine-tuned for the detection task.
+ Removing low-level parameters (e.g., initial convolution layer) from training
+ usually leads to higher training speed and slightly better testing accuracy.
+ The intuition is that the low-level architecture (e.g., ResNet-50) is able to
+ capture low-level features such as edges; therefore, it does not need to be
+ fine-tuned for the detection task.
- Args:
- variables: all the variables in training
- prefix: prefix of backbone
+ Args:
+ variables: all the variables in training
+ prefix: prefix of backbone
- Returns:
- var_list: a list containing variables for training
+ Returns:
+ var_list: a list containing variables for training
- """
- # Freeze at conv2 based on reference model.
- # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
- remove_list = []
- remove_list.append(prefix + 'conv2d/')
- remove_list.append(prefix + 'batch_normalization/')
- for i in range(1, 11):
- remove_list.append(prefix + 'conv2d_{}/'.format(i))
- remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
+ """
+ # Freeze at conv2 based on reference model.
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
+ remove_list = []
+ remove_list.append(prefix + 'conv2d/')
+ remove_list.append(prefix + 'batch_normalization/')
+ for i in range(1, 11):
+ remove_list.append(prefix + 'conv2d_{}/'.format(i))
+ remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
- def _is_kept(variable):
- for rm_str in remove_list:
- if rm_str in variable.name:
- return False
- return True
+ def _is_kept(variable):
+ for rm_str in remove_list:
+ if rm_str in variable.name:
+ return False
+ return True
- var_list = [v for v in variables if _is_kept(v)]
- return var_list
+ var_list = [v for v in variables if _is_kept(v)]
+ var_list = [v for v in variables if _is_kept(v)]
+ return var_list
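A usage sketch (the scope prefix is hypothetical and depends on the backbone
variable scope):

# Variables such as 'resnet50/conv2d_3/kernel' or
# 'resnet50/batch_normalization_7/gamma' match the remove list and are
# excluded; everything else stays trainable.
var_list = remove_variables(tf.trainable_variables(), prefix='resnet50/')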
def compute_model_statistics(batch_size, is_training=True):
- """Compute number of parameters and FLOPS."""
- num_trainable_params = np.sum(
- [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
- logging.info('number of trainable params: %d', num_trainable_params)
-
- options = tf.profiler.ProfileOptionBuilder.float_operation()
- options['output'] = 'none'
- flops = tf.profiler.profile(
- tf.get_default_graph(), options=options).total_float_ops
- flops_per_image = flops / batch_size
- if is_training:
- logging.info(
- 'number of FLOPS per image: %f in training', flops_per_image)
- else:
- logging.info(
- 'number of FLOPS per image: %f in eval', flops_per_image)
+ """Compute number of parameters and FLOPS."""
+ num_trainable_params = np.sum(
+ [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
+ logging.info('number of trainable params: %d', num_trainable_params)
+
+ options = tf.profiler.ProfileOptionBuilder.float_operation()
+ options['output'] = 'none'
+ flops = tf.profiler.profile(
+ tf.get_default_graph(), options=options).total_float_ops
+ flops_per_image = flops / batch_size
+ if is_training:
+ logging.info(
+ 'number of FLOPS per image: %f in training', flops_per_image)
+ else:
+ logging.info(
+ 'number of FLOPS per image: %f in eval', flops_per_image)
+
+
+def build_box_outputs(mask, params):
+ with tf.tpu.bfloat16_scope():
+ with tf.variable_scope('bbox_head_by_mask'):
+
+ def build_transform_variable(l_r, min_cost, image_size):
+ # Builds an image_size x image_size matrix with 1 on the diagonal and a
+ # large negative value (min_cost) strictly below ("right") or strictly
+ # above ("left") it. Multiplying a binary row by this matrix and applying
+ # a ReLU keeps only the rightmost (resp. leftmost) active position.
+ variable = []
+ if l_r == "right":
+ l_v = min_cost
+ r_v = 0
+ elif l_r == "left":
+ l_v = 0
+ r_v = min_cost
+ else:
+ raise ValueError('l_r must be left or right')
+
+ for i in range(image_size):
+ row = []
+ for k in range(i):
+ row.append(l_v)
+ row.append(1)
+ for k in range(image_size - 1 - i):
+ row.append(r_v)
+ variable.append(row)
+
+ return variable
+
+ def cal_offset(scope, input, img_size, alpha=1e-4):
+ with tf.variable_scope(scope):
+ c_left = tf.constant(build_transform_variable("left", -img_size * 2., img_size), dtype=tf.float32)
+ c_right = tf.constant(build_transform_variable("right", -img_size * 2., img_size), dtype=tf.float32)
+ if params['precision'] == 'bfloat16':
+ c_left = tf.cast(c_left, tf.bfloat16)
+ c_right = tf.cast(c_right, tf.bfloat16)
+ with tf.variable_scope("mask"):
+ if params['precision'] == 'bfloat16':
+ net = tf.cast(tf.greater(input, alpha), tf.bfloat16)
+ else:
+ net = tf.cast(tf.greater(input, alpha), tf.float32)
+
+ # for left
+ net_left = tf.nn.relu(tf.matmul(net, c_left))
+ mask_left = tf.stop_gradient(net_left, name="mask_left")
+
+ # for right
+ net_right = tf.nn.relu(tf.matmul(net, c_right))
+ mask_right = tf.stop_gradient(net_right, name="mask")
+
+ with tf.variable_scope("work"):
+ offset_left = 1 - input
+ left = offset_left + tf.constant([float(i) for i in range(img_size)], dtype=tf.bfloat16)
+ left = tf.reduce_sum(left * mask_left, axis=-1)
+
+ right = input + tf.constant([float(i) for i in range(img_size)], dtype=tf.bfloat16)
+ right = tf.reduce_sum(right * mask_right, axis=-1)
+
+ return left, right, [net_right, input]
+
+ batch_size, num_boxes, img_size, _ = mask.get_shape().as_list()
+ mask = tf.reshape(mask, [-1, img_size, img_size])
+ # Dropout rescales the kept activations by 1/keep_prob, so clip back to [0, 1].
+ mask = tf.nn.dropout(mask, keep_prob=0.98)
+ mask = tf.clip_by_value(mask, 0, 1)
+ row = tf.reduce_max(mask, axis=2) # activation profile along height (y)
+ col = tf.reduce_max(mask, axis=1) # activation profile along width (x)
+ row_l, row_r, debug_row = cal_offset("cal_offset_row", row, img_size)
+ col_l, col_r, debug_col = cal_offset("cal_offset_col", col, img_size)
+ bbox = tf.stack([row_l, col_l, row_r, col_r], axis=-1)
+ bbox = tf.reshape(bbox, [batch_size, num_boxes, 4])
+
+ return bbox, img_size # y1,x1,y2,x2
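build_box_outputs derives a box directly from the soft mask: each row/column
activation profile is multiplied by a constant matrix and passed through a
ReLU, which isolates its outermost active position. A minimal NumPy sketch of
that mechanism for img_size=4 (values are illustrative only):

import numpy as np

row = np.array([0., 1., 1., 0.])           # thresholded activation profile
# "left" matrix: 1 on the diagonal, min_cost (-img_size * 2) above it.
c_left = np.array([[1, -8, -8, -8],
                   [0,  1, -8, -8],
                   [0,  0,  1, -8],
                   [0,  0,  0,  1]], dtype=np.float32)
# "right" matrix: 1 on the diagonal, min_cost below it.
c_right = c_left.T
left_mask = np.maximum(row @ c_left, 0.)   # [0., 1., 0., 0.] -> leftmost pixel
right_mask = np.maximum(row @ c_right, 0.) # [0., 0., 1., 0.] -> rightmost pixel
# The masked sums in cal_offset then turn these one-hot rows into sub-pixel
# boundaries, here roughly 1 and 3 for the active span [1, 2].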
def build_model_graph(features, labels, is_training, params):
- """Builds the forward model graph."""
- use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
- is_gpu_inference = (not is_training and use_batched_nms)
- model_outputs = {}
-
- if is_training and params['transpose_input']:
- if (params['backbone'].startswith('resnet') and
- params['conv0_space_to_depth_block_size'] > 0):
- features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ """Builds the forward model graph."""
+ use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
+ is_gpu_inference = (not is_training and use_batched_nms)
+ model_outputs = {}
+
+ if is_training and params['transpose_input']:
+ if (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0):
+ features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ else:
+ features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
+
+ batch_size, image_height, image_width, _ = (
+ features['images'].get_shape().as_list())
+
+ conv0_space_to_depth_block_size = 0
+ if (is_training and
+ (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0)):
+ conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
+ image_height *= conv0_space_to_depth_block_size
+ image_width *= conv0_space_to_depth_block_size
+
+ if 'source_ids' not in features:
+ features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
+
+ all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
+ params['num_scales'], params['aspect_ratios'],
+ params['anchor_scale'],
+ (image_height, image_width))
+
+ if 'resnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ resnet_fn = resnet.resnet_v1(
+ params['backbone'],
+ conv0_kernel_size=params['conv0_kernel_size'],
+ conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
+ num_batch_norm_group=params['num_batch_norm_group'])
+ backbone_feats = resnet_fn(
+ features['images'],
+ (params['is_training_bn'] and is_training))
+ elif 'mnasnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ _, endpoints = mnasnet_models.build_mnasnet_base(
+ features['images'],
+ params['backbone'],
+ training=(params['is_training_bn'] and is_training),
+ override_params={'use_keras': False})
+
+ backbone_feats = {
+ 2: endpoints['reduction_2'],
+ 3: endpoints['reduction_3'],
+ 4: endpoints['reduction_4'],
+ 5: endpoints['reduction_5'],
+ }
else:
- features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
-
- batch_size, image_height, image_width, _ = (
- features['images'].get_shape().as_list())
-
- conv0_space_to_depth_block_size = 0
- if (is_training and
- (params['backbone'].startswith('resnet') and
- params['conv0_space_to_depth_block_size'] > 0)):
- conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
- image_height *= conv0_space_to_depth_block_size
- image_width *= conv0_space_to_depth_block_size
-
- if 'source_ids' not in features:
- features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
-
- all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
- params['num_scales'], params['aspect_ratios'],
- params['anchor_scale'],
- (image_height, image_width))
-
- if 'resnet' in params['backbone']:
- with tf.variable_scope(params['backbone']):
- resnet_fn = resnet.resnet_v1(
- params['backbone'],
- conv0_kernel_size=params['conv0_kernel_size'],
- conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
- num_batch_norm_group=params['num_batch_norm_group'])
- backbone_feats = resnet_fn(
- features['images'],
- (params['is_training_bn'] and is_training))
- elif 'mnasnet' in params['backbone']:
- with tf.variable_scope(params['backbone']):
- _, endpoints = mnasnet_models.build_mnasnet_base(
- features['images'],
- params['backbone'],
- training=(params['is_training_bn'] and is_training),
- override_params={'use_keras': False})
-
- backbone_feats = {
- 2: endpoints['reduction_2'],
- 3: endpoints['reduction_3'],
- 4: endpoints['reduction_4'],
- 5: endpoints['reduction_5'],
- }
- else:
- raise ValueError('Not a valid backbone option: %s' % params['backbone'])
-
- fpn_feats = fpn.fpn(
- backbone_feats, params['min_level'], params['max_level'])
- model_outputs.update({
- 'fpn_features': fpn_feats,
- })
-
- rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
- fpn_feats,
- params['min_level'], params['max_level'],
- len(params['aspect_ratios'] * params['num_scales']))
-
- if is_training:
- rpn_pre_nms_topn = params['rpn_pre_nms_topn']
- rpn_post_nms_topn = params['rpn_post_nms_topn']
- else:
- rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
- rpn_post_nms_topn = params['test_rpn_post_nms_topn']
-
- rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
- rpn_score_outputs,
- rpn_box_outputs,
- all_anchors,
- features['image_info'],
- rpn_pre_nms_topn,
- rpn_post_nms_topn,
- params['rpn_nms_threshold'],
- params['rpn_min_size'],
- bbox_reg_weights=None,
- use_batched_nms=use_batched_nms)
- rpn_box_rois = tf.to_float(rpn_box_rois)
- if is_training:
- rpn_box_rois = tf.stop_gradient(rpn_box_rois)
- rpn_box_scores = tf.stop_gradient(rpn_box_scores)
-
- if is_training:
- # Sampling
- box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
- training_ops.proposal_label_op(
- rpn_box_rois,
- labels['gt_boxes'],
- labels['gt_classes'],
- batch_size_per_im=params['batch_size_per_im'],
- fg_fraction=params['fg_fraction'],
- fg_thresh=params['fg_thresh'],
- bg_thresh_hi=params['bg_thresh_hi'],
- bg_thresh_lo=params['bg_thresh_lo']))
-
- # Performs multi-level RoIAlign.
- box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
- fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
-
- class_outputs, box_outputs, _ = heads.box_head(
- box_roi_features, num_classes=params['num_classes'],
- mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
-
- if not is_training:
- if is_gpu_inference:
- generate_detections_fn = postprocess_ops.generate_detections_gpu
- else:
- generate_detections_fn = postprocess_ops.generate_detections_tpu
- detections = generate_detections_fn(
- class_outputs,
- box_outputs,
- rpn_box_rois,
- features['image_info'],
- params['test_rpn_post_nms_topn'],
- params['test_detections_per_image'],
- params['test_nms'],
- params['bbox_reg_weights'])
+ raise ValueError('Not a valid backbone option: %s' % params['backbone'])
+ fpn_feats = fpn.fpn(
+ backbone_feats, params['min_level'], params['max_level'])
model_outputs.update({
- 'num_detections': detections[0],
- 'detection_boxes': detections[1],
- 'detection_classes': detections[2],
- 'detection_scores': detections[3],
- })
- else:
- encoded_box_targets = training_ops.encode_box_targets(
- rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
- model_outputs.update({
- 'rpn_score_outputs': rpn_score_outputs,
- 'rpn_box_outputs': rpn_box_outputs,
- 'class_outputs': class_outputs,
- 'box_outputs': box_outputs,
- 'class_targets': class_targets,
- 'box_targets': encoded_box_targets,
- 'box_rois': rpn_box_rois,
+ 'fpn_features': fpn_feats,
})
- # Faster-RCNN mode.
- if not params['include_mask']:
+ rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
+ fpn_feats,
+ params['min_level'], params['max_level'],
+ len(params['aspect_ratios'] * params['num_scales']))
+
+ if is_training:
+ rpn_pre_nms_topn = params['rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['rpn_post_nms_topn']
+ else:
+ rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['test_rpn_post_nms_topn']
+
+ # rpn_box_rois: [ymin, xmin, ymax, xmax]
+ rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
+ rpn_score_outputs,
+ rpn_box_outputs,
+ all_anchors,
+ features['image_info'],
+ rpn_pre_nms_topn,
+ rpn_post_nms_topn,
+ params['rpn_nms_threshold'],
+ params['rpn_min_size'],
+ bbox_reg_weights=None,
+ use_batched_nms=use_batched_nms)
+ rpn_box_rois = tf.to_float(rpn_box_rois)
+ if is_training:
+ rpn_box_rois = tf.stop_gradient(rpn_box_rois)
+ rpn_box_scores = tf.stop_gradient(rpn_box_scores)
+
+ if is_training:
+ # Sampling
+ box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
+ training_ops.proposal_label_op(
+ rpn_box_rois,
+ labels['gt_boxes'],
+ labels['gt_classes'],
+ batch_size_per_im=params['batch_size_per_im'],
+ fg_fraction=params['fg_fraction'],
+ fg_thresh=params['fg_thresh'],
+ bg_thresh_hi=params['bg_thresh_hi'],
+ bg_thresh_lo=params['bg_thresh_lo']))
+
+ # Performs multi-level RoIAlign.
+ box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
+
+ class_outputs, _, _ = heads.box_head(
+ box_roi_features, num_classes=params['num_classes'],
+ mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
+
+ if not is_training:
+ if is_gpu_inference:
+ generate_detections_fn = postprocess_ops.generate_detections_gpu
+ else:
+ generate_detections_fn = postprocess_ops.generate_detections_tpu
+ detections = generate_detections_fn(
+ class_outputs,
+ rpn_box_rois,
+ features['image_info'],
+ params['test_rpn_post_nms_topn'],
+ params['test_detections_per_image'],
+ params['test_nms'],
+ params['bbox_reg_weights'])
+
+ model_outputs.update({
+ 'num_detections': detections[0],
+ 'detection_boxes': detections[1],
+ 'detection_classes': detections[2],
+ 'detection_scores': detections[3],
+ })
+ else:
+ # encoded_box_targets = training_ops.encode_box_targets(
+ # rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
+ model_outputs.update({
+ 'rpn_score_outputs': rpn_score_outputs,
+ 'rpn_box_outputs': rpn_box_outputs,
+ 'class_outputs': class_outputs,
+ 'class_targets': class_targets,
+ # 'box_outputs': box_outputs,
+ # 'box_targets': box_targets,
+ # 'box_rois': rpn_box_rois,
+ })
+
+ # # Faster-RCNN mode.
+ # if not params['include_mask']:
+ # # Print #parameters and #FLOPs in model.
+ # compute_model_statistics(batch_size, is_training=is_training)
+ #
+ # return model_outputs
+
+ def expand_boxes(boxes, scale):
+ # whereas `boxes` here is in [y1, x1, y2, x2] form
+ w_half = (boxes[..., 3] - boxes[..., 1]) * .5
+ h_half = (boxes[..., 2] - boxes[..., 0]) * .5
+ x_c = boxes[..., 1] + w_half
+ y_c = boxes[..., 0] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = tf.stack([y_c - h_half, x_c - w_half, y_c + h_half, x_c + w_half], axis=-1)
+
+ return boxes_exp
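expand_boxes grows a box about its center; a quick worked example with
scale 1.2 (numbers are illustrative only):

# boxes = [0., 0., 2., 2.] -> center (1, 1), half sizes 1.0 -> 1.2,
# so the expanded box is [-0.2, -0.2, 2.2, 2.2]. The clip_boxes call further
# down keeps such boxes inside the padded image.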
+
+ # Mask sampling
+ if not is_training:
+ selected_box_rois = model_outputs['detection_boxes']
+ class_indices = model_outputs['detection_classes']
+ # If using GPU for inference, delay the cast until the Gather ops show up,
+ # since GPU inference supports floating point better.
+ # TODO(laigd): revisit this when newer versions of GPU libraries is
+ # released.
+ selected_box_rois = expand_boxes(selected_box_rois, 1.2)
+ if not is_gpu_inference:
+ class_indices = tf.to_int32(class_indices)
+ else:
+ (selected_class_targets, selected_box_targets, selected_box_rois,
+ proposal_to_label_map) = (
+ training_ops.select_fg_for_masks(
+ class_targets, box_targets, rpn_box_rois,
+ proposal_to_label_map,
+ max_num_fg=int(
+ params['batch_size_per_im'] * params['fg_fraction'])))
+
+ selected_box_rois = expand_boxes(selected_box_targets, 1.2)
+ class_indices = tf.to_int32(selected_class_targets)
+
+
+ import box_utils
+ # Clip the expanded RoIs to the image boundary (the 1024 here assumes a
+ # 1024x1024 padded input).
+ selected_box_rois = box_utils.clip_boxes(selected_box_rois, 1024, 1024)
+ mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats,
+ selected_box_rois,
+ output_size=14,
+ is_gpu_inference=is_gpu_inference)
+ mask_outputs = heads.mask_head(
+ mask_roi_features,
+ class_indices,
+ num_classes=params['num_classes'],
+ mrcnn_resolution=params['mrcnn_resolution'],
+ is_gpu_inference=is_gpu_inference)
+
+ soft_mask_outputs = tf.nn.sigmoid(mask_outputs)
+ # Boxes are predicted in mask coordinates; map them back to image
+ # coordinates by scaling with the RoI size and offsetting by its origin.
+ box_outputs_in_mask, image_size = build_box_outputs(soft_mask_outputs, params)
+ offset = tf.stack(
+ [selected_box_rois[..., 0], selected_box_rois[..., 1], selected_box_rois[..., 0], selected_box_rois[..., 1]],
+ axis=-1)
+ h_scale = (selected_box_rois[..., 2] - selected_box_rois[..., 0]) / image_size
+ w_scale = (selected_box_rois[..., 3] - selected_box_rois[..., 1]) / image_size
+ wh_scale = tf.stack([h_scale, w_scale, h_scale, w_scale], axis=-1)
+ if params['precision'] == 'bfloat16':
+ wh_scale = tf.cast(wh_scale, tf.bfloat16)
+ offset = tf.cast(offset, tf.bfloat16)
+
+ box_outputs = offset + box_outputs_in_mask * wh_scale
+
# Print #parameters and #FLOPs in model.
compute_model_statistics(batch_size, is_training=is_training)
- return model_outputs
-
- # Mask sampling
- if not is_training:
- selected_box_rois = model_outputs['detection_boxes']
- class_indices = model_outputs['detection_classes']
- # If using GPU for inference, delay the cast until when Gather ops show up
- # since GPU inference supports float point better.
- # TODO(laigd): revisit this when newer versions of GPU libraries is
- # released.
- if not is_gpu_inference:
- class_indices = tf.to_int32(class_indices)
- else:
- (selected_class_targets, selected_box_targets, selected_box_rois,
- proposal_to_label_map) = (
- training_ops.select_fg_for_masks(
- class_targets, box_targets, rpn_box_rois,
- proposal_to_label_map,
- max_num_fg=int(
- params['batch_size_per_im'] * params['fg_fraction'])))
- class_indices = tf.to_int32(selected_class_targets)
-
- mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
- fpn_feats,
- selected_box_rois,
- output_size=14,
- is_gpu_inference=is_gpu_inference)
- mask_outputs = heads.mask_head(
- mask_roi_features,
- class_indices,
- num_classes=params['num_classes'],
- mrcnn_resolution=params['mrcnn_resolution'],
- is_gpu_inference=is_gpu_inference)
-
- # Print #parameters and #FLOPs in model.
- compute_model_statistics(batch_size, is_training=is_training)
-
- if is_training:
- mask_targets = training_ops.get_mask_targets(
- selected_box_rois, proposal_to_label_map, selected_box_targets,
- labels['cropped_gt_masks'], params['mrcnn_resolution'])
- model_outputs.update({
- 'mask_outputs': mask_outputs,
- 'mask_targets': mask_targets,
- 'selected_class_targets': selected_class_targets,
- })
- else:
- model_outputs.update({
- 'detection_masks': tf.nn.sigmoid(mask_outputs),
- })
+ if is_training:
+ mask_targets = training_ops.get_mask_targets(
+ selected_box_rois, proposal_to_label_map, selected_box_targets,
+ labels['cropped_gt_masks'], params['mrcnn_resolution'])
+ model_outputs.update({
+ 'mask_outputs': mask_outputs,
+ 'mask_targets': mask_targets,
+ 'selected_class_targets': selected_class_targets,
+ 'box_outputs': box_outputs,
+ 'box_targets': selected_box_targets,
+ })
+ else:
+ model_outputs.update({
+ 'detection_masks': soft_mask_outputs,
+ 'detection_boxes': box_outputs,
+ 'selected_box_rois': selected_box_rois
+ })
- return model_outputs
+ return model_outputs
def _build_assigment_map(optimizer, prefix=None, skip_variables_regex=None):
- """Generate assigment map for loading checkpoints."""
- optimizer_vars = set([var.name for var in optimizer.variables()])
- all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
- if not prefix:
- prefix = ''
- assignment_map = {}
- for var in all_vars:
- if var.name not in optimizer_vars:
- var_name = var.name
- # Trim the index of the variable.
- if ':' in var_name:
- var_name = var_name[:var_name.rindex(':')]
- if skip_variables_regex and re.match(skip_variables_regex,
- var_name[len(prefix):]):
- continue
- assignment_map[var_name[len(prefix):]] = var
- return assignment_map
+ """Generate assigment map for loading checkpoints."""
+ optimizer_vars = set([var.name for var in optimizer.variables()])
+ all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
+ if not prefix:
+ prefix = ''
+ assignment_map = {}
+ for var in all_vars:
+ if var.name not in optimizer_vars:
+ var_name = var.name
+ # Trim the index of the variable.
+ if ':' in var_name:
+ var_name = var_name[:var_name.rindex(':')]
+ if skip_variables_regex and re.match(skip_variables_regex,
+ var_name[len(prefix):]):
+ continue
+ assignment_map[var_name[len(prefix):]] = var
+ return assignment_map
def _model_fn(features, labels, mode, params, variable_filter_fn=None):
- """Model defination for the Mask-RCNN model based on ResNet.
-
- Args:
- features: the input image tensor and auxiliary information, such as
- `image_info` and `source_ids`. The image tensor has a shape of
- [batch_size, height, width, 3]. The height and width are fixed and equal.
- labels: the input labels in a dictionary. The labels include score targets
- and box targets which are dense label maps. The labels are generated from
- get_input_fn function in data/dataloader.py
- mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
- params: the dictionary defines hyperparameters of model. The default
- settings are in default_hparams function in this file.
- variable_filter_fn: the filter function that takes trainable_variables and
- returns the variable list after applying the filter rule.
-
- Returns:
- tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
- """
- if (mode == tf.estimator.ModeKeys.PREDICT or
- mode == tf.estimator.ModeKeys.EVAL):
- if ((params['include_groundtruth_in_features'] or
- mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
- # In include groundtruth for eval.
- labels = features['labels']
-
- if 'features' in features:
- features = features['features']
- # Otherwise, it is in export mode, the features is past in directly.
-
- if params['precision'] == 'bfloat16':
- with tf.tpu.bfloat16_scope():
- model_outputs = build_model_graph(features, labels,
- mode == tf.estimator.ModeKeys.TRAIN,
- params)
- model_outputs.update({
- 'source_id': features['source_ids'],
- 'image_info': features['image_info'],
- })
- def cast_outputs_to_float(d):
- for k, v in sorted(six.iteritems(d)):
- if isinstance(v, dict):
- cast_outputs_to_float(v)
- else:
- d[k] = tf.cast(v, tf.float32)
- cast_outputs_to_float(model_outputs)
- else:
- model_outputs = build_model_graph(features, labels,
- mode == tf.estimator.ModeKeys.TRAIN,
- params)
- model_outputs.update({
- 'source_id': features['source_ids'],
- 'image_info': features['image_info'],
- })
+ """Model defination for the Mask-RCNN model based on ResNet.
+
+ Args:
+ features: the input image tensor and auxiliary information, such as
+ `image_info` and `source_ids`. The image tensor has a shape of
+ [batch_size, height, width, 3]. The height and width are fixed and equal.
+ labels: the input labels in a dictionary. The labels include score targets
+ and box targets which are dense label maps. The labels are generated from
+ get_input_fn function in data/dataloader.py
+ mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+ params: the dictionary defines hyperparameters of model. The default
+ settings are in default_hparams function in this file.
+ variable_filter_fn: the filter function that takes trainable_variables and
+ returns the variable list after applying the filter rule.
+
+ Returns:
+ tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
+ """
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if ((params['include_groundtruth_in_features'] or
+ mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
+ # Include groundtruth for eval.
+ labels = features['labels']
+
+ if 'features' in features:
+ features = features['features']
+ # Otherwise, it is in export mode and the features are passed in directly.
+
+ if params['precision'] == 'bfloat16':
+ with tf.tpu.bfloat16_scope():
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ def cast_outputs_to_float(d):
+ for k, v in sorted(six.iteritems(d)):
+ if isinstance(v, dict):
+ cast_outputs_to_float(v)
+ else:
+ d[k] = tf.cast(v, tf.float32)
+
+ cast_outputs_to_float(model_outputs)
+ else:
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ # First check if it is in PREDICT or EVAL mode to fill out predictions.
+ # Predictions are used during the eval step to generate metrics.
+ predictions = {}
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if 'orig_images' in features:
+ model_outputs['orig_images'] = features['orig_images']
+ if labels and params['include_groundtruth_in_features']:
+ # Labels can only be embedded in predictions. The prediction cannot output
+ # a dictionary as a value.
+ predictions.update(labels)
+ model_outputs.pop('fpn_features', None)
+ predictions.update(model_outputs)
+ # If we are doing PREDICT, we can return here.
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
+ predictions=predictions)
+ return tf.estimator.EstimatorSpec(mode=mode,
+ predictions=predictions)
+
+ # Set up training loss and learning rate.
+ global_step = tf.train.get_or_create_global_step()
+ if params['learning_rate_type'] == 'step':
+ learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['learning_rate_levels'],
+ params['learning_rate_steps'])
+ elif params['learning_rate_type'] == 'cosine':
+ learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['total_steps'])
+ else:
+ raise ValueError('Unsupported learning rate type: `{}`!'
+ .format(params['learning_rate_type']))
+ # score_loss and box_loss are for logging. Only total_loss is optimized.
+ total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
+ model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
+ labels, params)
+
+ (total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss) = losses.fast_rcnn_loss(
+ model_outputs['class_outputs'], model_outputs['box_outputs'],
+ model_outputs['class_targets'], model_outputs['box_targets'],
+ model_outputs['selected_class_targets'], params)
+ # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
+ if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
+ mask_loss = losses.mask_rcnn_loss(
+ model_outputs['mask_outputs'], model_outputs['mask_targets'],
+ model_outputs['selected_class_targets'], params)
+ else:
+ mask_loss = 0.
+ if variable_filter_fn and ('resnet' in params['backbone']):
+ var_list = variable_filter_fn(tf.trainable_variables(),
+ params['backbone'] + '/')
+ else:
+ var_list = tf.trainable_variables()
+ l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in var_list
+ if 'batch_normalization' not in v.name and 'bias' not in v.name
+ ])
+ total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
+ l2_regularization_loss)
+
+ host_call = None
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = create_optimizer(learning_rate, params)
+ if params['use_tpu']:
+ optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+
+ scaffold_fn = None
+ if params['warm_start_path']:
+
+ def warm_start_scaffold_fn():
+ logging.info(
+ 'model_fn warm start from: %s,', params['warm_start_path'])
+ assignment_map = _build_assigment_map(
+ optimizer,
+ prefix=None,
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
+ return tf.train.Scaffold()
+
+ scaffold_fn = warm_start_scaffold_fn
+
+ elif params['checkpoint']:
+
+ def backbone_scaffold_fn():
+ """Loads pretrained model through scaffold function."""
+ # Exclude all variable of optimizer.
+ vars_to_load = _build_assigment_map(
+ optimizer,
+ prefix=params['backbone'] + '/',
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
+ if not vars_to_load:
+ raise ValueError('Variables to load is empty.')
+ return tf.train.Scaffold()
+
+ scaffold_fn = backbone_scaffold_fn
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
+ if params['global_gradient_clip_ratio'] > 0:
+ # Clips the gradients for training stability.
+ # Refer: https://arxiv.org/abs/1211.5063
+ with tf.name_scope('clipping'):
+ old_grads, variables = zip(*grads_and_vars)
+ num_weights = sum(
+ g.shape.num_elements() for g in old_grads if g is not None)
+ clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
+ num_weights)
+ logging.info(
+ 'Global clip norm set to %g for %d variables with %d elements.',
+ clip_norm, sum(1 for g in old_grads if g is not None),
+ num_weights)
+ gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
+ else:
+ gradients, variables = zip(*grads_and_vars)
+ grads_and_vars = []
+ # Special treatment for biases (beta is named as bias in reference model)
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
+ for grad, var in zip(gradients, variables):
+ if grad is not None and ('beta' in var.name or 'bias' in var.name):
+ grad = 2.0 * grad
+ grads_and_vars.append((grad, var))
+
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.apply_gradients(
+ grads_and_vars, global_step=global_step)
+
+ if params['use_host_call']:
+ def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
+ rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
+ learning_rate):
+ """Training host call. Creates scalar summaries for training metrics.
+
+ This function is executed on the CPU and should not directly reference
+ any Tensors in the rest of the `model_fn`. To pass Tensors from the
+ model to the `metric_fn`, provide as part of the `host_call`. See
+ https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
+ for more information.
+
+ Arguments should match the list of `Tensor` objects passed as the second
+ element in the tuple passed to `host_call`.
+
+ Args:
+ global_step: `Tensor` with shape `[batch, ]` for the global_step.
+ total_loss: `Tensor` with shape `[batch, ]` for the training loss.
+ total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ loss.
+ rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ score loss.
+ rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ box loss.
+ total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN loss.
+ fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN class loss.
+ fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN box loss.
+ mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
+ mask loss.
+ l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
+ regularization loss.
+ learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
+
+ Returns:
+ List of summary ops to run on the CPU host.
+ """
+ # Outfeed supports int32 but global_step is expected to be int64.
+ global_step = tf.reduce_mean(global_step)
+ # Host call fns are executed FLAGS.iterations_per_loop times after one
+ # TPU loop is finished, setting max_queue value to the same as number of
+ # iterations will make the summary writer only flush the data to storage
+ # once per loop.
+ with (tf2.summary.create_file_writer(
+ params['model_dir'],
+ max_queue=params['iterations_per_loop']).as_default()):
+ with tf2.summary.record_if(True):
+ tf2.summary.scalar(
+ 'total_loss', tf.reduce_mean(total_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
+ step=global_step)
+ if params['include_mask']:
+ tf2.summary.scalar(
+ 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
+ tf2.summary.scalar(
+ 'l2_regularization_loss',
+ tf.reduce_mean(l2_regularization_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'learning_rate', tf.reduce_mean(learning_rate),
+ step=global_step)
+
+ return tf.summary.all_v2_summary_ops()
+
+ # To log the loss, current learning rate, and epoch for Tensorboard, the
+ # summary op needs to be run on the host CPU via host_call. host_call
+ # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+ # dimension. These Tensors are implicitly concatenated to
+ # [params['batch_size']].
+ global_step_t = tf.reshape(global_step, [1])
+ total_loss_t = tf.reshape(total_loss, [1])
+ total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
+ rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
+ rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
+ total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
+ fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
+ fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
+ mask_loss_t = tf.reshape(mask_loss, [1])
+ l2_regularization_loss = tf.reshape(l2_regularization_loss, [1])
+ learning_rate_t = tf.reshape(learning_rate, [1])
+ host_call = (host_call_fn,
+ [global_step_t, total_loss_t, total_rpn_loss_t,
+ rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
+ fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
+ mask_loss_t, l2_regularization_loss, learning_rate_t])
+ else:
+ train_op = None
+ scaffold_fn = None
- # First check if it is in PREDICT or EVAL mode to fill out predictions.
- # Predictions are used during the eval step to generate metrics.
- predictions = {}
- if (mode == tf.estimator.ModeKeys.PREDICT or
- mode == tf.estimator.ModeKeys.EVAL):
- if 'orig_images' in features:
- model_outputs['orig_images'] = features['orig_images']
- if labels and params['include_groundtruth_in_features']:
- # Labels can only be embedded in predictions. The predition cannot output
- # dictionary as a value.
- predictions.update(labels)
- model_outputs.pop('fpn_features', None)
- predictions.update(model_outputs)
- # If we are doing PREDICT, we can return here.
- if mode == tf.estimator.ModeKeys.PREDICT:
- if params['use_tpu']:
- return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
- predictions=predictions)
- return tf.estimator.EstimatorSpec(mode=mode,
- predictions=predictions)
-
- # Set up training loss and learning rate.
- global_step = tf.train.get_or_create_global_step()
- if params['learning_rate_type'] == 'step':
- learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
- global_step,
- params['init_learning_rate'],
- params['warmup_learning_rate'],
- params['warmup_steps'],
- params['learning_rate_levels'],
- params['learning_rate_steps'])
- elif params['learning_rate_type'] == 'cosine':
- learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
- global_step,
- params['init_learning_rate'],
- params['warmup_learning_rate'],
- params['warmup_steps'],
- params['total_steps'])
- else:
- raise ValueError('Unsupported learning rate type: `{}`!'
- .format(params['learning_rate_type']))
- # score_loss and box_loss are for logging. only total_loss is optimized.
- total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
- model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
- labels, params)
-
- (total_fast_rcnn_loss, fast_rcnn_class_loss,
- fast_rcnn_box_loss) = losses.fast_rcnn_loss(
- model_outputs['class_outputs'], model_outputs['box_outputs'],
- model_outputs['class_targets'], model_outputs['box_targets'], params)
- # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
- if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
- mask_loss = losses.mask_rcnn_loss(
- model_outputs['mask_outputs'], model_outputs['mask_targets'],
- model_outputs['selected_class_targets'], params)
- else:
- mask_loss = 0.
- if variable_filter_fn and ('resnet' in params['backbone']):
- var_list = variable_filter_fn(tf.trainable_variables(),
- params['backbone'] + '/')
- else:
- var_list = tf.trainable_variables()
- l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
- tf.nn.l2_loss(v)
- for v in var_list
- if 'batch_normalization' not in v.name and 'bias' not in v.name
- ])
- total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
- l2_regularization_loss)
-
- host_call = None
- if mode == tf.estimator.ModeKeys.TRAIN:
- optimizer = create_optimizer(learning_rate, params)
if params['use_tpu']:
- optimizer = tf.tpu.CrossShardOptimizer(optimizer)
-
- scaffold_fn = None
- if params['warm_start_path']:
-
- def warm_start_scaffold_fn():
- logging.info(
- 'model_fn warm start from: %s,', params['warm_start_path'])
- assignment_map = _build_assigment_map(
- optimizer,
- prefix=None,
- skip_variables_regex=params['skip_checkpoint_variables'])
- tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
- return tf.train.Scaffold()
-
- scaffold_fn = warm_start_scaffold_fn
-
- elif params['checkpoint']:
-
- def backbone_scaffold_fn():
- """Loads pretrained model through scaffold function."""
- # Exclude all variable of optimizer.
- vars_to_load = _build_assigment_map(
- optimizer,
- prefix=params['backbone'] + '/',
- skip_variables_regex=params['skip_checkpoint_variables'])
- tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
- if not vars_to_load:
- raise ValueError('Variables to load is empty.')
- return tf.train.Scaffold()
-
- scaffold_fn = backbone_scaffold_fn
-
- # Batch norm requires update_ops to be added as a train_op dependency.
- update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
- grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
- if params['global_gradient_clip_ratio'] > 0:
- # Clips the gradients for training stability.
- # Refer: https://arxiv.org/abs/1211.5063
- with tf.name_scope('clipping'):
- old_grads, variables = zip(*grads_and_vars)
- num_weights = sum(
- g.shape.num_elements() for g in old_grads if g is not None)
- clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
- num_weights)
- logging.info(
- 'Global clip norm set to %g for %d variables with %d elements.',
- clip_norm, sum(1 for g in old_grads if g is not None),
- num_weights)
- gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
- else:
- gradients, variables = zip(*grads_and_vars)
- grads_and_vars = []
- # Special treatment for biases (beta is named as bias in reference model)
- # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
- for grad, var in zip(gradients, variables):
- if grad is not None and ('beta' in var.name or 'bias' in var.name):
- grad = 2.0 * grad
- grads_and_vars.append((grad, var))
-
- with tf.control_dependencies(update_ops):
- train_op = optimizer.apply_gradients(
- grads_and_vars, global_step=global_step)
-
- if params['use_host_call']:
- def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
- rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
- fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
- learning_rate):
- """Training host call. Creates scalar summaries for training metrics.
-
- This function is executed on the CPU and should not directly reference
- any Tensors in the rest of the `model_fn`. To pass Tensors from the
- model to the `metric_fn`, provide as part of the `host_call`. See
- https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
- for more information.
-
- Arguments should match the list of `Tensor` objects passed as the second
- element in the tuple passed to `host_call`.
-
- Args:
- global_step: `Tensor with shape `[batch, ]` for the global_step.
- total_loss: `Tensor` with shape `[batch, ]` for the training loss.
- total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
- loss.
- rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
- score loss.
- rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
- box loss.
- total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
- training Mask-RCNN loss.
- fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
- training Mask-RCNN class loss.
- fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
- training Mask-RCNN box loss.
- mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
- mask loss.
- l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
- regularization loss.
- learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
-
- Returns:
- List of summary ops to run on the CPU host.
- """
- # Outfeed supports int32 but global_step is expected to be int64.
- global_step = tf.reduce_mean(global_step)
- # Host call fns are executed FLAGS.iterations_per_loop times after one
- # TPU loop is finished, setting max_queue value to the same as number of
- # iterations will make the summary writer only flush the data to storage
- # once per loop.
- with (tf2.summary.create_file_writer(
- params['model_dir'],
- max_queue=params['iterations_per_loop']).as_default()):
- with tf2.summary.record_if(True):
- tf2.summary.scalar(
- 'total_loss', tf.reduce_mean(total_loss), step=global_step)
- tf2.summary.scalar(
- 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
- step=global_step)
- tf2.summary.scalar(
- 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
- step=global_step)
- tf2.summary.scalar(
- 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
- tf2.summary.scalar(
- 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
- step=global_step)
- tf2.summary.scalar(
- 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
- step=global_step)
- tf2.summary.scalar(
- 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
- step=global_step)
- if params['include_mask']:
- tf2.summary.scalar(
- 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
- tf2.summary.scalar(
- 'l2_regularization_loss',
- tf.reduce_mean(l2_regularization_loss),
- step=global_step)
- tf2.summary.scalar(
- 'learning_rate', tf.reduce_mean(learning_rate),
- step=global_step)
-
- return tf.summary.all_v2_summary_ops()
-
- # To log the loss, current learning rate, and epoch for Tensorboard, the
- # summary op needs to be run on the host CPU via host_call. host_call
- # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
- # dimension. These Tensors are implicitly concatenated to
- # [params['batch_size']].
- global_step_t = tf.reshape(global_step, [1])
- total_loss_t = tf.reshape(total_loss, [1])
- total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
- rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
- rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
- total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
- fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
- fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
- mask_loss_t = tf.reshape(mask_loss, [1])
- l2_regularization_loss = tf.reshape(l2_regularization_loss, [1])
- learning_rate_t = tf.reshape(learning_rate, [1])
- host_call = (host_call_fn,
- [global_step_t, total_loss_t, total_rpn_loss_t,
- rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
- fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
- mask_loss_t, l2_regularization_loss, learning_rate_t])
- else:
- train_op = None
- scaffold_fn = None
-
- if params['use_tpu']:
- return tf.estimator.tpu.TPUEstimatorSpec(
- mode=mode,
- loss=total_loss,
- train_op=train_op,
- host_call=host_call,
- scaffold_fn=scaffold_fn)
- return tf.estimator.EstimatorSpec(
- mode=mode, loss=total_loss, train_op=train_op)
+ return tf.estimator.tpu.TPUEstimatorSpec(
+ mode=mode,
+ loss=total_loss,
+ train_op=train_op,
+ host_call=host_call,
+ scaffold_fn=scaffold_fn)
+ return tf.estimator.EstimatorSpec(
+ mode=mode, loss=total_loss, train_op=train_op)
def mask_rcnn_model_fn(features, labels, mode, params):
- """Mask-RCNN model."""
- with tf.variable_scope('', reuse=tf.AUTO_REUSE):
- return _model_fn(
- features,
- labels,
- mode,
- params,
- variable_filter_fn=remove_variables)
+ """Mask-RCNN model."""
+ with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+ return _model_fn(
+ features,
+ labels,
+ mode,
+ params,
+ variable_filter_fn=remove_variables)
diff --git a/models/official/mask_rcnn/mask_rcnn_model_back.py b/models/official/mask_rcnn/mask_rcnn_model_back.py
new file mode 100644
index 000000000..3c221169e
--- /dev/null
+++ b/models/official/mask_rcnn/mask_rcnn_model_back.py
@@ -0,0 +1,685 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model defination for the Mask-RCNN Model.
+
+Defines model_fn of Mask-RCNN for TF Estimator. The model_fn includes Mask-RCNN
+model architecture, loss function, learning rate schedule, and evaluation
+procedure.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+import re
+from absl import logging
+import numpy as np
+import six
+import tensorflow.compat.v1 as tf
+import tensorflow.compat.v2 as tf2
+
+import anchors
+import fpn
+import heads
+import learning_rates
+import losses
+import postprocess_ops
+import resnet
+import roi_ops
+import spatial_transform_ops
+import training_ops
+import sys
+sys.path.append('tpu/models/official/mnasnet')
+import mnasnet_models
+
+
+def create_optimizer(learning_rate, params):
+ """Creates optimized based on the specified flags."""
+ if params['optimizer'] == 'momentum':
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'adam':
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ elif params['optimizer'] == 'adadelta':
+ optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+ elif params['optimizer'] == 'adagrad':
+ optimizer = tf.train.AdagradOptimizer(learning_rate)
+ elif params['optimizer'] == 'rmsprop':
+ optimizer = tf.train.RMSPropOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'lars':
+ try:
+ from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
+
+ optimizer = LARSOptimizer(
+ learning_rate,
+ momentum=params['momentum'],
+ weight_decay=params['lars_weight_decay'],
+ skip_list=['batch_normalization', 'bias'])
+ except ImportError as e:
+ logging.exception('LARSOptimizer is currently not supported '
+ 'in TensorFlow 2.x.')
+ raise e
+
+ else:
+ raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
+ return optimizer
+
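+# Minimal illustration of the dispatch above (values are illustrative, not
+# this model's defaults): create_optimizer(0.08, {'optimizer': 'momentum',
+# 'momentum': 0.9}) returns tf.train.MomentumOptimizer(0.08, momentum=0.9).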
+
+def remove_variables(variables, prefix):
+ """Removes low-level variables from the input.
+
+ Removing low-level parameters (e.g., initial convolution layer) from training
+ usually leads to higher training speed and slightly better testing accuracy.
+ The intuition is that the low-level architecture (e.g., ResNet-50) is able to
+ capture low-level features such as edges; therefore, it does not need to be
+ fine-tuned for the detection task.
+
+ Args:
+ variables: all the variables in training
+ prefix: prefix of backbone
+
+ Returns:
+ var_list: a list containing variables for training
+
+ """
+ # Freeze at conv2 based on reference model.
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
+ remove_list = []
+ remove_list.append(prefix + 'conv2d/')
+ remove_list.append(prefix + 'batch_normalization/')
+ for i in range(1, 11):
+ remove_list.append(prefix + 'conv2d_{}/'.format(i))
+ remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
+
+ def _is_kept(variable):
+ for rm_str in remove_list:
+ if rm_str in variable.name:
+ return False
+ return True
+
+ var_list = [v for v in variables if _is_kept(v)]
+ return var_list
+
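+# For example, with a 'resnet50' backbone the prefix is 'resnet50/', so
+# variables such as 'resnet50/conv2d/kernel:0' or
+# 'resnet50/batch_normalization_7/gamma:0' are dropped from the returned
+# list, while 'resnet50/conv2d_11/kernel:0' and deeper layers are kept.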
+
+def compute_model_statistics(batch_size, is_training=True):
+ """Compute number of parameters and FLOPS."""
+ num_trainable_params = np.sum(
+ [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
+ logging.info('number of trainable params: %d', num_trainable_params)
+
+ options = tf.profiler.ProfileOptionBuilder.float_operation()
+ options['output'] = 'none'
+ flops = tf.profiler.profile(
+ tf.get_default_graph(), options=options).total_float_ops
+ flops_per_image = flops / batch_size
+ if is_training:
+ logging.info(
+ 'number of FLOPS per image: %f in training', flops_per_image)
+ else:
+ logging.info(
+ 'number of FLOPS per image: %f in eval', flops_per_image)
+
+
+def build_model_graph(features, labels, is_training, params):
+ """Builds the forward model graph."""
+ use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
+ is_gpu_inference = (not is_training and use_batched_nms)
+ model_outputs = {}
+
+ if is_training and params['transpose_input']:
+ if (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0):
+ features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ else:
+ features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
+
+ batch_size, image_height, image_width, _ = (
+ features['images'].get_shape().as_list())
+
+ conv0_space_to_depth_block_size = 0
+ if (is_training and
+ (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0)):
+ conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
+ image_height *= conv0_space_to_depth_block_size
+ image_width *= conv0_space_to_depth_block_size
+
+ if 'source_ids' not in features:
+ features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
+
+ all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
+ params['num_scales'], params['aspect_ratios'],
+ params['anchor_scale'],
+ (image_height, image_width))
+
+ if 'resnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ resnet_fn = resnet.resnet_v1(
+ params['backbone'],
+ conv0_kernel_size=params['conv0_kernel_size'],
+ conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
+ num_batch_norm_group=params['num_batch_norm_group'])
+ backbone_feats = resnet_fn(
+ features['images'],
+ (params['is_training_bn'] and is_training))
+ elif 'mnasnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ _, endpoints = mnasnet_models.build_mnasnet_base(
+ features['images'],
+ params['backbone'],
+ training=(params['is_training_bn'] and is_training),
+ override_params={'use_keras': False})
+
+ backbone_feats = {
+ 2: endpoints['reduction_2'],
+ 3: endpoints['reduction_3'],
+ 4: endpoints['reduction_4'],
+ 5: endpoints['reduction_5'],
+ }
+ else:
+ raise ValueError('Not a valid backbone option: %s' % params['backbone'])
+
+ fpn_feats = fpn.fpn(
+ backbone_feats, params['min_level'], params['max_level'])
+ model_outputs.update({
+ 'fpn_features': fpn_feats,
+ })
+
+ rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
+ fpn_feats,
+ params['min_level'], params['max_level'],
+ len(params['aspect_ratios'] * params['num_scales']))
+
+ if is_training:
+ rpn_pre_nms_topn = params['rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['rpn_post_nms_topn']
+ else:
+ rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['test_rpn_post_nms_topn']
+
+ rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
+ rpn_score_outputs,
+ rpn_box_outputs,
+ all_anchors,
+ features['image_info'],
+ rpn_pre_nms_topn,
+ rpn_post_nms_topn,
+ params['rpn_nms_threshold'],
+ params['rpn_min_size'],
+ bbox_reg_weights=None,
+ use_batched_nms=use_batched_nms)
+ rpn_box_rois = tf.to_float(rpn_box_rois)
+ if is_training:
+ rpn_box_rois = tf.stop_gradient(rpn_box_rois)
+ rpn_box_scores = tf.stop_gradient(rpn_box_scores)
+
+ if is_training:
+ # Sampling
+ box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
+ training_ops.proposal_label_op(
+ rpn_box_rois,
+ labels['gt_boxes'],
+ labels['gt_classes'],
+ batch_size_per_im=params['batch_size_per_im'],
+ fg_fraction=params['fg_fraction'],
+ fg_thresh=params['fg_thresh'],
+ bg_thresh_hi=params['bg_thresh_hi'],
+ bg_thresh_lo=params['bg_thresh_lo']))
+
+ # Performs multi-level RoIAlign.
+ box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
+
+ class_outputs, box_outputs, _ = heads.box_head(
+ box_roi_features, num_classes=params['num_classes'],
+ mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
+
+ if not is_training:
+ if is_gpu_inference:
+ generate_detections_fn = postprocess_ops.generate_detections_gpu
+ else:
+ generate_detections_fn = postprocess_ops.generate_detections_tpu
+ detections = generate_detections_fn(
+ class_outputs,
+ box_outputs,
+ rpn_box_rois,
+ features['image_info'],
+ params['test_rpn_post_nms_topn'],
+ params['test_detections_per_image'],
+ params['test_nms'],
+ params['bbox_reg_weights'])
+
+ model_outputs.update({
+ 'num_detections': detections[0],
+ 'detection_boxes': detections[1],
+ 'detection_classes': detections[2],
+ 'detection_scores': detections[3],
+ })
+ else:
+ encoded_box_targets = training_ops.encode_box_targets(
+ rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
+ model_outputs.update({
+ 'rpn_score_outputs': rpn_score_outputs,
+ 'rpn_box_outputs': rpn_box_outputs,
+ 'class_outputs': class_outputs,
+ 'box_outputs': box_outputs,
+ 'class_targets': class_targets,
+ 'box_targets': encoded_box_targets,
+ 'box_rois': rpn_box_rois,
+ })
+
+ # Faster-RCNN mode.
+ if not params['include_mask']:
+ # Print #parameters and #FLOPs in model.
+ compute_model_statistics(batch_size, is_training=is_training)
+
+ return model_outputs
+
+ # Mask sampling
+ if not is_training:
+ selected_box_rois = model_outputs['detection_boxes']
+ class_indices = model_outputs['detection_classes']
+ # If using GPU for inference, delay the cast until the Gather ops show up,
+ # since GPU inference handles floating point better.
+ # TODO(laigd): revisit this when newer versions of the GPU libraries are
+ # released.
+ if not is_gpu_inference:
+ class_indices = tf.to_int32(class_indices)
+ else:
+ (selected_class_targets, selected_box_targets, selected_box_rois,
+ proposal_to_label_map) = (
+ training_ops.select_fg_for_masks(
+ class_targets, box_targets, rpn_box_rois,
+ proposal_to_label_map,
+ max_num_fg=int(
+ params['batch_size_per_im'] * params['fg_fraction'])))
+ class_indices = tf.to_int32(selected_class_targets)
+
+ mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats,
+ selected_box_rois,
+ output_size=14,
+ is_gpu_inference=is_gpu_inference)
+ mask_outputs = heads.mask_head(
+ mask_roi_features,
+ class_indices,
+ num_classes=params['num_classes'],
+ mrcnn_resolution=params['mrcnn_resolution'],
+ is_gpu_inference=is_gpu_inference)
+
+ # Print #parameters and #FLOPs in model.
+ compute_model_statistics(batch_size, is_training=is_training)
+
+ if is_training:
+ mask_targets = training_ops.get_mask_targets(
+ selected_box_rois, proposal_to_label_map, selected_box_targets,
+ labels['cropped_gt_masks'], params['mrcnn_resolution'])
+ model_outputs.update({
+ 'mask_outputs': mask_outputs,
+ 'mask_targets': mask_targets,
+ 'selected_class_targets': selected_class_targets,
+ })
+ else:
+ model_outputs.update({
+ 'detection_masks': tf.nn.sigmoid(mask_outputs),
+ })
+
+ return model_outputs
+
+
+def _build_assigment_map(optimizer, prefix=None, skip_variables_regex=None):
+ """Generate assigment map for loading checkpoints."""
+ optimizer_vars = set([var.name for var in optimizer.variables()])
+ all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
+ if not prefix:
+ prefix = ''
+ assignment_map = {}
+ for var in all_vars:
+ if var.name not in optimizer_vars:
+ var_name = var.name
+ # Trim the index of the variable.
+ if ':' in var_name:
+ var_name = var_name[:var_name.rindex(':')]
+ if skip_variables_regex and re.match(skip_variables_regex,
+ var_name[len(prefix):]):
+ continue
+ assignment_map[var_name[len(prefix):]] = var
+ return assignment_map
+
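+# For example, with prefix 'resnet50/', a graph variable named
+# 'resnet50/conv2d/kernel:0' is stored under the checkpoint key
+# 'conv2d/kernel', so a backbone-only checkpoint saved without the scope
+# prefix can still be restored into the scoped graph.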
+
+def _model_fn(features, labels, mode, params, variable_filter_fn=None):
+ """Model defination for the Mask-RCNN model based on ResNet.
+
+ Args:
+ features: the input image tensor and auxiliary information, such as
+ `image_info` and `source_ids`. The image tensor has a shape of
+ [batch_size, height, width, 3]. The height and width are fixed and equal.
+ labels: the input labels in a dictionary. The labels include score targets
+ and box targets which are dense label maps. The labels are generated from
+ get_input_fn function in data/dataloader.py
+ mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+ params: the dictionary that defines the hyperparameters of the model. The
+ default settings are in the default_hparams function in this file.
+ variable_filter_fn: the filter function that takes trainable_variables and
+ returns the variable list after applying the filter rule.
+
+ Returns:
+ tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
+ """
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if ((params['include_groundtruth_in_features'] or
+ mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
+ # Include groundtruth for eval.
+ labels = features['labels']
+
+ if 'features' in features:
+ features = features['features']
+ # Otherwise, it is in export mode and the features are passed in directly.
+
+ if params['precision'] == 'bfloat16':
+ with tf.tpu.bfloat16_scope():
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+ def cast_outputs_to_float(d):
+ for k, v in sorted(six.iteritems(d)):
+ if isinstance(v, dict):
+ cast_outputs_to_float(v)
+ else:
+ d[k] = tf.cast(v, tf.float32)
+ cast_outputs_to_float(model_outputs)
+ else:
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ # First check if it is in PREDICT or EVAL mode to fill out predictions.
+ # Predictions are used during the eval step to generate metrics.
+ predictions = {}
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if 'orig_images' in features:
+ model_outputs['orig_images'] = features['orig_images']
+ if labels and params['include_groundtruth_in_features']:
+ # Labels can only be embedded in predictions. The prediction output cannot
+ # contain a dictionary as a value.
+ predictions.update(labels)
+ model_outputs.pop('fpn_features', None)
+ predictions.update(model_outputs)
+ # If we are doing PREDICT, we can return here.
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
+ predictions=predictions)
+ return tf.estimator.EstimatorSpec(mode=mode,
+ predictions=predictions)
+
+ # Set up training loss and learning rate.
+ global_step = tf.train.get_or_create_global_step()
+ if params['learning_rate_type'] == 'step':
+ learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['learning_rate_levels'],
+ params['learning_rate_steps'])
+ elif params['learning_rate_type'] == 'cosine':
+ learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['total_steps'])
+ else:
+ raise ValueError('Unsupported learning rate type: `{}`!'
+ .format(params['learning_rate_type']))
+ # score_loss and box_loss are for logging. only total_loss is optimized.
+ total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
+ model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
+ labels, params)
+
+ (total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss) = losses.fast_rcnn_loss(
+ model_outputs['class_outputs'], model_outputs['box_outputs'],
+ model_outputs['class_targets'], model_outputs['box_targets'], params)
+ # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
+ if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
+ mask_loss = losses.mask_rcnn_loss(
+ model_outputs['mask_outputs'], model_outputs['mask_targets'],
+ model_outputs['selected_class_targets'], params)
+ else:
+ mask_loss = 0.
+ if variable_filter_fn and ('resnet' in params['backbone']):
+ var_list = variable_filter_fn(tf.trainable_variables(),
+ params['backbone'] + '/')
+ else:
+ var_list = tf.trainable_variables()
+ l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in var_list
+ if 'batch_normalization' not in v.name and 'bias' not in v.name
+ ])
+ total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss +
+ l2_regularization_loss)
+
+ host_call = None
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = create_optimizer(learning_rate, params)
+ if params['use_tpu']:
+ optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+
+ scaffold_fn = None
+ if params['warm_start_path']:
+
+ def warm_start_scaffold_fn():
+ logging.info(
+ 'model_fn warm start from: %s,', params['warm_start_path'])
+ assignment_map = _build_assigment_map(
+ optimizer,
+ prefix=None,
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
+ return tf.train.Scaffold()
+
+ scaffold_fn = warm_start_scaffold_fn
+
+ elif params['checkpoint']:
+
+ def backbone_scaffold_fn():
+ """Loads pretrained model through scaffold function."""
+ # Exclude all variables of the optimizer.
+ vars_to_load = _build_assigment_map(
+ optimizer,
+ prefix=params['backbone'] + '/',
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
+ if not vars_to_load:
+ raise ValueError('Variables to load is empty.')
+ return tf.train.Scaffold()
+
+ scaffold_fn = backbone_scaffold_fn
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
+ if params['global_gradient_clip_ratio'] > 0:
+ # Clips the gradients for training stability.
+ # Refer: https://arxiv.org/abs/1211.5063
+ with tf.name_scope('clipping'):
+ old_grads, variables = zip(*grads_and_vars)
+ num_weights = sum(
+ g.shape.num_elements() for g in old_grads if g is not None)
+ clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
+ num_weights)
+ logging.info(
+ 'Global clip norm set to %g for %d variables with %d elements.',
+ clip_norm, sum(1 for g in old_grads if g is not None),
+ num_weights)
+ gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
+ else:
+ gradients, variables = zip(*grads_and_vars)
+ grads_and_vars = []
+ # Special treatment for biases (beta is named as bias in reference model)
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
+ for grad, var in zip(gradients, variables):
+ if grad is not None and ('beta' in var.name or 'bias' in var.name):
+ grad = 2.0 * grad
+ grads_and_vars.append((grad, var))
+
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.apply_gradients(
+ grads_and_vars, global_step=global_step)
+
+ if params['use_host_call']:
+ def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
+ rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
+ learning_rate):
+ """Training host call. Creates scalar summaries for training metrics.
+
+ This function is executed on the CPU and should not directly reference
+ any Tensors in the rest of the `model_fn`. To pass Tensors from the
+ model to the `metric_fn`, provide as part of the `host_call`. See
+ https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
+ for more information.
+
+ Arguments should match the list of `Tensor` objects passed as the second
+ element in the tuple passed to `host_call`.
+
+ Args:
+ global_step: `Tensor` with shape `[batch, ]` for the global_step.
+ total_loss: `Tensor` with shape `[batch, ]` for the training loss.
+ total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ loss.
+ rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ score loss.
+ rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ box loss.
+ total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN loss.
+ fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN class loss.
+ fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
+ training Mask-RCNN box loss.
+ mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
+ mask loss.
+ l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
+ regularization loss.
+ learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
+
+ Returns:
+ List of summary ops to run on the CPU host.
+ """
+ # Outfeed supports int32 but global_step is expected to be int64.
+ global_step = tf.reduce_mean(global_step)
+ # Host call fns are executed FLAGS.iterations_per_loop times after one
+ # TPU loop is finished. Setting max_queue to the same number of
+ # iterations makes the summary writer flush data to storage only once
+ # per loop.
+ with (tf2.summary.create_file_writer(
+ params['model_dir'],
+ max_queue=params['iterations_per_loop']).as_default()):
+ with tf2.summary.record_if(True):
+ tf2.summary.scalar(
+ 'total_loss', tf.reduce_mean(total_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
+ step=global_step)
+ if params['include_mask']:
+ tf2.summary.scalar(
+ 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
+ tf2.summary.scalar(
+ 'l2_regularization_loss',
+ tf.reduce_mean(l2_regularization_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'learning_rate', tf.reduce_mean(learning_rate),
+ step=global_step)
+
+ return tf.summary.all_v2_summary_ops()
+
+ # To log the loss, current learning rate, and epoch for Tensorboard, the
+ # summary op needs to be run on the host CPU via host_call. host_call
+ # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+ # dimension. These Tensors are implicitly concatenated to
+ # [params['batch_size']].
+ global_step_t = tf.reshape(global_step, [1])
+ total_loss_t = tf.reshape(total_loss, [1])
+ total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
+ rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
+ rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
+ total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
+ fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
+ fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
+ mask_loss_t = tf.reshape(mask_loss, [1])
+ l2_regularization_loss = tf.reshape(l2_regularization_loss, [1])
+ learning_rate_t = tf.reshape(learning_rate, [1])
+ host_call = (host_call_fn,
+ [global_step_t, total_loss_t, total_rpn_loss_t,
+ rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
+ fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
+ mask_loss_t, l2_regularization_loss, learning_rate_t])
+ else:
+ train_op = None
+ scaffold_fn = None
+
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(
+ mode=mode,
+ loss=total_loss,
+ train_op=train_op,
+ host_call=host_call,
+ scaffold_fn=scaffold_fn)
+ return tf.estimator.EstimatorSpec(
+ mode=mode, loss=total_loss, train_op=train_op)
+
+
+def mask_rcnn_model_fn(features, labels, mode, params):
+ """Mask-RCNN model."""
+ with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+ return _model_fn(
+ features,
+ labels,
+ mode,
+ params,
+ variable_filter_fn=remove_variables)
diff --git a/models/official/mask_rcnn/mask_rcnn_model_new.py b/models/official/mask_rcnn/mask_rcnn_model_new.py
new file mode 100644
index 000000000..f6517f4f9
--- /dev/null
+++ b/models/official/mask_rcnn/mask_rcnn_model_new.py
@@ -0,0 +1,798 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model defination for the Mask-RCNN Model.
+
+Defines model_fn of Mask-RCNN for TF Estimator. The model_fn includes Mask-RCNN
+model architecture, loss function, learning rate schedule, and evaluation
+procedure.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+import re
+from absl import logging
+import numpy as np
+import six
+import tensorflow.compat.v1 as tf
+import tensorflow.compat.v2 as tf2
+
+import anchors
+import fpn
+import heads
+import learning_rates
+import losses
+import postprocess_ops
+import resnet
+import roi_ops
+import spatial_transform_ops
+import training_ops
+import sys
+
+sys.path.append('tpu/models/official/mnasnet')
+
+
+# import mnasnet_models
+
+
+def create_optimizer(learning_rate, params):
+ """Creates optimized based on the specified flags."""
+ if params['optimizer'] == 'momentum':
+ optimizer = tf.train.MomentumOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'adam':
+ optimizer = tf.train.AdamOptimizer(learning_rate)
+ elif params['optimizer'] == 'adadelta':
+ optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+ elif params['optimizer'] == 'adagrad':
+ optimizer = tf.train.AdagradOptimizer(learning_rate)
+ elif params['optimizer'] == 'rmsprop':
+ optimizer = tf.train.RMSPropOptimizer(
+ learning_rate, momentum=params['momentum'])
+ elif params['optimizer'] == 'lars':
+ try:
+ from tensorflow.contrib.opt import LARSOptimizer # pylint: disable=g-import-not-at-top
+
+ optimizer = LARSOptimizer(
+ learning_rate,
+ momentum=params['momentum'],
+ weight_decay=params['lars_weight_decay'],
+ skip_list=['batch_normalization', 'bias'])
+ except ImportError as e:
+ logging.exception('LARSOptimizer is currently not supported '
+ 'in TensorFlow 2.x.')
+ raise e
+
+ else:
+ raise ValueError('Unsupported optimizer type %s.' % params['optimizer'])
+ return optimizer
+
+
+def remove_variables(variables, prefix):
+ """Removes low-level variables from the input.
+
+ Removing low-level parameters (e.g., initial convolution layer) from training
+ usually leads to higher training speed and slightly better testing accuracy.
+ The intuition is that the low-level architecture (e.g., ResNet-50) is able to
+ capture low-level features such as edges; therefore, it does not need to be
+ fine-tuned for the detection task.
+
+ Args:
+ variables: all the variables in training
+ prefix: prefix of backbone
+
+ Returns:
+ var_list: a list containing variables for training
+
+ """
+ # Freeze at conv2 based on reference model.
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/config.py#L194 # pylint: disable=line-too-long
+ remove_list = []
+ remove_list.append(prefix + 'conv2d/')
+ remove_list.append(prefix + 'batch_normalization/')
+ for i in range(1, 11):
+ remove_list.append(prefix + 'conv2d_{}/'.format(i))
+ remove_list.append(prefix + 'batch_normalization_{}/'.format(i))
+
+ def _is_kept(variable):
+ for rm_str in remove_list:
+ if rm_str in variable.name:
+ return False
+ return True
+
+ var_list = [v for v in variables if _is_kept(v)]
+ return var_list
+
+
+def compute_model_statistics(batch_size, is_training=True):
+ """Compute number of parameters and FLOPS."""
+ num_trainable_params = np.sum(
+ [np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
+ logging.info('number of trainable params: %d', num_trainable_params)
+
+ options = tf.profiler.ProfileOptionBuilder.float_operation()
+ options['output'] = 'none'
+ flops = tf.profiler.profile(
+ tf.get_default_graph(), options=options).total_float_ops
+ flops_per_image = flops / batch_size
+ if is_training:
+ logging.info(
+ 'number of FLOPS per image: %f in training', flops_per_image)
+ else:
+ logging.info(
+ 'number of FLOPS per image: %f in eval', flops_per_image)
+
+
+def build_box_outputs(mask, params):
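+ """Derives box outputs from soft mask predictions.
+
+ The soft masks are reduced to per-row and per-column maxima, thresholded,
+ and multiplied by fixed transform matrices that isolate the first and last
+ active position along each axis (the thresholded profiles are stop-gradient,
+ so gradients still flow through the soft mask scores). The extents are
+ stacked into [y1, x1, y2, x2] boxes in mask-pixel coordinates.
+
+ Args:
+   mask: a [batch_size, num_boxes, mask_size, mask_size] tensor of soft mask
+     scores.
+   params: the model hyperparameter dictionary.
+
+ Returns:
+   A tuple of ([batch_size, num_boxes, 4] boxes, mask_size).
+ """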
+ with tf.tpu.bfloat16_scope():
+ with tf.variable_scope('bbox_head_by_mask'):
+
+ def build_transform_variable(l_r, min_cost, image_size):
+ variable = []
+ if l_r == "right":
+ l_v = min_cost
+ r_v = 0
+ elif l_r == "left":
+ l_v = 0
+ r_v = min_cost
+ else:
+ raise ValueError('l_r must be left or right')
+
+ for i in range(image_size):
+ row = []
+ for k in range(i):
+ row.append(l_v)
+ row.append(1)
+ for k in range(image_size - 1 - i):
+ row.append(r_v)
+ variable.append(row)
+
+ return variable
+
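+ # Worked example: for image_size=3 (min_cost is -image_size * 2 = -6 in
+ # cal_offset below), build_transform_variable('left', -6., 3) is
+ #   [[1, -6, -6], [0, 1, -6], [0, 0, 1]]
+ # and build_transform_variable('right', -6., 3) is
+ #   [[1, 0, 0], [-6, 1, 0], [-6, -6, 1]].
+ # Multiplying a thresholded 0/1 profile by these matrices and applying a
+ # relu leaves a 1 only at the first (respectively last) active index, which
+ # cal_offset turns into the box extents.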
+ def cal_offset(scope, input, img_size, alpha=1e-4):
+ with tf.variable_scope(scope):
+ c_left = tf.constant(build_transform_variable("left", -img_size * 2., img_size), dtype=tf.float32)
+ c_right = tf.constant(build_transform_variable("right", -img_size * 2., img_size), dtype=tf.float32)
+ if params['precision'] == 'bfloat16':
+ c_left = tf.cast(c_left, tf.bfloat16)
+ c_right = tf.cast(c_right, tf.bfloat16)
+ with tf.variable_scope("mask"):
+ if params['precision'] == 'bfloat16':
+ net = tf.cast(tf.greater(input, alpha), tf.bfloat16)
+ else:
+ net = tf.cast(tf.greater(input, alpha), tf.float32)
+
+ # for left
+ net_left = tf.nn.relu(tf.matmul(net, c_left))
+ mask_left = tf.stop_gradient(net_left, name="mask_left")
+
+ # for right
+ net_right = tf.nn.relu(tf.matmul(net, c_right))
+ mask_right = tf.stop_gradient(net_right, name="mask")
+
+ with tf.variable_scope("work"):
+ offset_left = 1 - input
+ # Use the input dtype so both the bfloat16 and float32 precision paths work.
+ left = offset_left + tf.constant([float(i) for i in range(img_size)], dtype=input.dtype)
+ left = tf.reduce_sum(left * mask_left, axis=-1)
+
+ right = input + tf.constant([float(i) for i in range(img_size)], dtype=input.dtype)
+ right = tf.reduce_sum(right * mask_right, axis=-1)
+
+ return left, right, [net_right, input]
+
+ batch_size, num_boxes, img_size, _ = mask.get_shape().as_list()
+ mask = tf.reshape(mask, [-1, img_size, img_size])
+ mask = tf.nn.dropout(mask, keep_prob=0.98)
+ mask = tf.clip_by_value(mask, 0, 1)
+ row = tf.reduce_max(mask, axis=2) # h
+ col = tf.reduce_max(mask, axis=1) # w
+ row_l, row_r, debug_row = cal_offset("cal_offset_row", row, img_size)
+ col_l, col_r, debug_col = cal_offset("cal_offset_col", col, img_size)
+ bbox = tf.stack([row_l, col_l, row_r, col_r], axis=-1)
+ bbox = tf.reshape(bbox, [batch_size, num_boxes, 4])
+
+ return bbox, img_size # y1,x1,y2,x2
+
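+# Illustrative sketch (not used by the model): the same first/last
+# active-index idea as cal_offset above, written for a single 1-D mask
+# profile with plain NumPy.
+#
+#   import numpy as np
+#   profile = np.array([0., 0., 0.9, 0.8, 0.7, 0.])  # per-column max of a mask
+#   active = profile > 1e-4
+#   first = int(np.argmax(active))                         # -> 2 (x1)
+#   last = len(active) - 1 - int(np.argmax(active[::-1]))  # -> 4 (x2)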
+
+def build_model_graph(features, labels, is_training, params):
+ """Builds the forward model graph."""
+ use_batched_nms = (not params['use_tpu'] and params['use_batched_nms'])
+ is_gpu_inference = (not is_training and use_batched_nms)
+ model_outputs = {}
+
+ if is_training and params['transpose_input']:
+ if (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0):
+ features['images'] = tf.transpose(features['images'], [2, 0, 1, 3])
+ else:
+ features['images'] = tf.transpose(features['images'], [3, 0, 1, 2])
+
+ batch_size, image_height, image_width, _ = (
+ features['images'].get_shape().as_list())
+
+ conv0_space_to_depth_block_size = 0
+ if (is_training and
+ (params['backbone'].startswith('resnet') and
+ params['conv0_space_to_depth_block_size'] > 0)):
+ conv0_space_to_depth_block_size = params['conv0_space_to_depth_block_size']
+ image_height *= conv0_space_to_depth_block_size
+ image_width *= conv0_space_to_depth_block_size
+
+ if 'source_ids' not in features:
+ features['source_ids'] = -1 * tf.ones([batch_size], dtype=tf.float32)
+
+ all_anchors = anchors.Anchors(params['min_level'], params['max_level'],
+ params['num_scales'], params['aspect_ratios'],
+ params['anchor_scale'],
+ (image_height, image_width))
+
+ if 'resnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ resnet_fn = resnet.resnet_v1(
+ params['backbone'],
+ conv0_kernel_size=params['conv0_kernel_size'],
+ conv0_space_to_depth_block_size=conv0_space_to_depth_block_size,
+ num_batch_norm_group=params['num_batch_norm_group'])
+ backbone_feats = resnet_fn(
+ features['images'],
+ (params['is_training_bn'] and is_training))
+ elif 'mnasnet' in params['backbone']:
+ with tf.variable_scope(params['backbone']):
+ _, endpoints = mnasnet_models.build_mnasnet_base(
+ features['images'],
+ params['backbone'],
+ training=(params['is_training_bn'] and is_training),
+ override_params={'use_keras': False})
+
+ backbone_feats = {
+ 2: endpoints['reduction_2'],
+ 3: endpoints['reduction_3'],
+ 4: endpoints['reduction_4'],
+ 5: endpoints['reduction_5'],
+ }
+ else:
+ raise ValueError('Not a valid backbone option: %s' % params['backbone'])
+
+ fpn_feats = fpn.fpn(
+ backbone_feats, params['min_level'], params['max_level'])
+ model_outputs.update({
+ 'fpn_features': fpn_feats,
+ })
+
+ rpn_score_outputs, rpn_box_outputs = heads.rpn_head(
+ fpn_feats,
+ params['min_level'], params['max_level'],
+ len(params['aspect_ratios'] * params['num_scales']))
+
+ if is_training:
+ rpn_pre_nms_topn = params['rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['rpn_post_nms_topn']
+ else:
+ rpn_pre_nms_topn = params['test_rpn_pre_nms_topn']
+ rpn_post_nms_topn = params['test_rpn_post_nms_topn']
+
+ # rpn_box_rois: [ymin, xmin, ymax, xmax]
+ rpn_box_scores, rpn_box_rois = roi_ops.multilevel_propose_rois(
+ rpn_score_outputs,
+ rpn_box_outputs,
+ all_anchors,
+ features['image_info'],
+ rpn_pre_nms_topn,
+ rpn_post_nms_topn,
+ params['rpn_nms_threshold'],
+ params['rpn_min_size'],
+ bbox_reg_weights=None,
+ use_batched_nms=use_batched_nms)
+ rpn_box_rois = tf.to_float(rpn_box_rois)
+ if is_training:
+ rpn_box_rois = tf.stop_gradient(rpn_box_rois)
+ rpn_box_scores = tf.stop_gradient(rpn_box_scores)
+
+ if is_training:
+ # Sampling
+ box_targets, class_targets, rpn_box_rois, proposal_to_label_map = (
+ training_ops.proposal_label_op(
+ rpn_box_rois,
+ labels['gt_boxes'],
+ labels['gt_classes'],
+ batch_size_per_im=params['batch_size_per_im'],
+ fg_fraction=params['fg_fraction'],
+ fg_thresh=params['fg_thresh'],
+ bg_thresh_hi=params['bg_thresh_hi'],
+ bg_thresh_lo=params['bg_thresh_lo']))
+
+ # Performs multi-level RoIAlign.
+ box_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats, rpn_box_rois, output_size=7, is_gpu_inference=is_gpu_inference)
+
+ class_outputs, _, _ = heads.box_head(
+ box_roi_features, num_classes=params['num_classes'],
+ mlp_head_dim=params['fast_rcnn_mlp_head_dim'])
+
+ if not is_training:
+ if is_gpu_inference:
+ generate_detections_fn = postprocess_ops.generate_detections_gpu
+ else:
+ generate_detections_fn = postprocess_ops.generate_detections_tpu
+ detections = generate_detections_fn(
+ class_outputs,
+ rpn_box_rois,
+ features['image_info'],
+ params['test_rpn_post_nms_topn'],
+ params['test_detections_per_image'],
+ params['test_nms'],
+ params['bbox_reg_weights'])
+
+ model_outputs.update({
+ 'num_detections': detections[0],
+ 'detection_boxes': detections[1],
+ 'detection_classes': detections[2],
+ 'detection_scores': detections[3],
+ })
+ else:
+ # encoded_box_targets = training_ops.encode_box_targets(
+ # rpn_box_rois, box_targets, class_targets, params['bbox_reg_weights'])
+ model_outputs.update({
+ 'rpn_score_outputs': rpn_score_outputs,
+ 'rpn_box_outputs': rpn_box_outputs,
+ 'class_outputs': class_outputs,
+ 'class_targets': class_targets,
+ # 'box_outputs': box_outputs,
+ # 'box_targets': box_targets,
+ # 'box_rois': rpn_box_rois,
+ })
+
+ # # Faster-RCNN mode.
+ # if not params['include_mask']:
+ # # Print #parameters and #FLOPs in model.
+ # compute_model_statistics(batch_size, is_training=is_training)
+ #
+ # return model_outputs
+
+ def expand_boxes(boxes, scale):
+ # whereas `boxes` here is in [y1, x1, y2, x2] form
+ w_half = (boxes[..., 3] - boxes[..., 1]) * .5
+ h_half = (boxes[..., 2] - boxes[..., 0]) * .5
+ x_c = boxes[..., 1] + w_half
+ y_c = boxes[..., 0] + h_half
+
+ w_half *= scale
+ h_half *= scale
+
+ boxes_exp = tf.stack([y_c - h_half, x_c - w_half, y_c + h_half, x_c + w_half], axis=-1)
+
+ return boxes_exp
+
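+ # For reference, a box [0., 0., 10., 20.] expanded with scale=1.2 becomes
+ # [-1., -2., 11., 22.]: the center is preserved while the height and width
+ # both grow by 1.2x.
+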
+ # Mask sampling
+ if not is_training:
+ selected_box_rois = model_outputs['detection_boxes']
+ class_indices = model_outputs['detection_classes']
+ # If using GPU for inference, delay the cast until the Gather ops show up,
+ # since GPU inference handles floating point better.
+ # TODO(laigd): revisit this when newer versions of the GPU libraries are
+ # released.
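+ # The boxes are expanded by 1.2x before the mask crop below, presumably to
+ # leave some margin so the mask-derived boxes from build_box_outputs can
+ # extend slightly beyond the first-stage boxes (this rationale is an
+ # assumption; it is not stated in this change).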
+ selected_box_rois = expand_boxes(selected_box_rois, 1.2)
+ if not is_gpu_inference:
+ class_indices = tf.to_int32(class_indices)
+ else:
+ (selected_class_targets, selected_box_targets, selected_box_rois,
+ proposal_to_label_map) = (
+ training_ops.select_fg_for_masks(
+ class_targets, box_targets, rpn_box_rois,
+ proposal_to_label_map,
+ max_num_fg=int(
+ params['batch_size_per_im'] * params['fg_fraction'])))
+
+ selected_box_rois = expand_boxes(selected_box_targets, 1.2)
+ class_indices = tf.to_int32(selected_class_targets)
+
+
+ import box_utils
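+ # Clip the (expanded) RoIs to the image extent; the 1024x1024 bound is
+ # hard-coded here and assumes the model's default padded image size.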
+ selected_box_rois = box_utils.clip_boxes(selected_box_rois, 1024, 1024)
+ mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
+ fpn_feats,
+ selected_box_rois,
+ output_size=14,
+ is_gpu_inference=is_gpu_inference)
+ mask_outputs = heads.mask_head(
+ mask_roi_features,
+ class_indices,
+ num_classes=params['num_classes'],
+ mrcnn_resolution=params['mrcnn_resolution'],
+ is_gpu_inference=is_gpu_inference)
+
+ soft_mask_outputs = tf.nn.sigmoid(mask_outputs)
+ box_outputs_in_mak, image_size = build_box_outputs(soft_mask_outputs, params)
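+ # build_box_outputs returns boxes in mask-pixel coordinates ([0, image_size],
+ # where image_size is the mask resolution). Scale them by the RoI
+ # height/width over the mask resolution and shift by the RoI's top-left
+ # corner to map them back into image coordinates.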
+ offset = tf.stack(
+ [selected_box_rois[..., 0], selected_box_rois[..., 1], selected_box_rois[..., 0], selected_box_rois[..., 1]],
+ axis=-1)
+ h_scale = (selected_box_rois[..., 2] - selected_box_rois[..., 0]) / image_size
+ w_scale = (selected_box_rois[..., 3] - selected_box_rois[..., 1]) / image_size
+ wh_scale = tf.stack([h_scale, w_scale, h_scale, w_scale], axis=-1)
+ if params['precision'] == 'bfloat16':
+ wh_scale = tf.cast(wh_scale, tf.bfloat16)
+ offset = tf.cast(offset, tf.bfloat16)
+
+ box_outputs = offset + box_outputs_in_mak * wh_scale
+
+ # Print #parameters and #FLOPs in model.
+ compute_model_statistics(batch_size, is_training=is_training)
+
+ if is_training:
+ mask_targets = training_ops.get_mask_targets(
+ selected_box_rois, proposal_to_label_map, selected_box_targets,
+ labels['cropped_gt_masks'], params['mrcnn_resolution'])
+ model_outputs.update({
+ 'mask_outputs': mask_outputs,
+ 'mask_targets': mask_targets,
+ 'selected_class_targets': selected_class_targets,
+ 'box_outputs': box_outputs,
+ 'box_targets': selected_box_targets,
+ })
+ else:
+ model_outputs.update({
+ 'detection_masks': soft_mask_outputs,
+ 'detection_boxes': box_outputs,
+ 'selected_box_rois': selected_box_rois
+ })
+
+ return model_outputs
+
+
+def _build_assigment_map(optimizer, prefix=None, skip_variables_regex=None):
+ """Generate assigment map for loading checkpoints."""
+ optimizer_vars = set([var.name for var in optimizer.variables()])
+ all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=prefix)
+ if not prefix:
+ prefix = ''
+ assignment_map = {}
+ for var in all_vars:
+ if var.name not in optimizer_vars:
+ var_name = var.name
+ # Trim the index of the variable.
+ if ':' in var_name:
+ var_name = var_name[:var_name.rindex(':')]
+ if skip_variables_regex and re.match(skip_variables_regex,
+ var_name[len(prefix):]):
+ continue
+ assignment_map[var_name[len(prefix):]] = var
+ return assignment_map
+
+
+def _model_fn(features, labels, mode, params, variable_filter_fn=None):
+ """Model defination for the Mask-RCNN model based on ResNet.
+
+ Args:
+ features: the input image tensor and auxiliary information, such as
+ `image_info` and `source_ids`. The image tensor has a shape of
+ [batch_size, height, width, 3]. The height and width are fixed and equal.
+ labels: the input labels in a dictionary. The labels include score targets
+ and box targets which are dense label maps. The labels are generated from
+ get_input_fn function in data/dataloader.py
+ mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+ params: the dictionary that defines the hyperparameters of the model. The
+ default settings are in the default_hparams function in this file.
+ variable_filter_fn: the filter function that takes trainable_variables and
+ returns the variable list after applying the filter rule.
+
+ Returns:
+ tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.
+ """
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if ((params['include_groundtruth_in_features'] or
+ mode == tf.estimator.ModeKeys.EVAL) and ('labels' in features)):
+ # Include groundtruth for eval.
+ labels = features['labels']
+
+ if 'features' in features:
+ features = features['features']
+ # Otherwise, it is in export mode and the features are passed in directly.
+
+ if params['precision'] == 'bfloat16':
+ with tf.tpu.bfloat16_scope():
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ def cast_outputs_to_float(d):
+ for k, v in sorted(six.iteritems(d)):
+ if isinstance(v, dict):
+ cast_outputs_to_float(v)
+ else:
+ d[k] = tf.cast(v, tf.float32)
+
+ cast_outputs_to_float(model_outputs)
+ else:
+ model_outputs = build_model_graph(features, labels,
+ mode == tf.estimator.ModeKeys.TRAIN,
+ params)
+ model_outputs.update({
+ 'source_id': features['source_ids'],
+ 'image_info': features['image_info'],
+ })
+
+ # First check if it is in PREDICT or EVAL mode to fill out predictions.
+ # Predictions are used during the eval step to generate metrics.
+ predictions = {}
+ if (mode == tf.estimator.ModeKeys.PREDICT or
+ mode == tf.estimator.ModeKeys.EVAL):
+ if 'orig_images' in features:
+ model_outputs['orig_images'] = features['orig_images']
+ if labels and params['include_groundtruth_in_features']:
+ # Labels can only be embedded in predictions. The prediction output cannot
+ # contain a dictionary as a value.
+ predictions.update(labels)
+ model_outputs.pop('fpn_features', None)
+ predictions.update(model_outputs)
+ # If we are doing PREDICT, we can return here.
+ if mode == tf.estimator.ModeKeys.PREDICT:
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
+ predictions=predictions)
+ return tf.estimator.EstimatorSpec(mode=mode,
+ predictions=predictions)
+
+ # Set up training loss and learning rate.
+ global_step = tf.train.get_or_create_global_step()
+ if params['learning_rate_type'] == 'step':
+ learning_rate = learning_rates.step_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['learning_rate_levels'],
+ params['learning_rate_steps'])
+ elif params['learning_rate_type'] == 'cosine':
+ learning_rate = learning_rates.cosine_learning_rate_with_linear_warmup(
+ global_step,
+ params['init_learning_rate'],
+ params['warmup_learning_rate'],
+ params['warmup_steps'],
+ params['total_steps'])
+ else:
+ raise ValueError('Unsupported learning rate type: `{}`!'
+ .format(params['learning_rate_type']))
+ # score_loss and box_loss are for logging. only total_loss is optimized.
+ total_rpn_loss, rpn_score_loss, rpn_box_loss = losses.rpn_loss(
+ model_outputs['rpn_score_outputs'], model_outputs['rpn_box_outputs'],
+ labels, params)
+
+ (total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss) = losses.fast_rcnn_loss(
+ model_outputs['class_outputs'], model_outputs['box_outputs'],
+ model_outputs['class_targets'], model_outputs['box_targets'], model_outputs['selected_class_targets'], params)
+ # Only training has the mask loss. Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/model_builder.py # pylint: disable=line-too-long
+ if mode == tf.estimator.ModeKeys.TRAIN and params['include_mask']:
+ mask_loss = losses.mask_rcnn_loss(
+ model_outputs['mask_outputs'], model_outputs['mask_targets'],
+ model_outputs['selected_class_targets'], params)
+ else:
+ mask_loss = 0.
+ if variable_filter_fn and ('resnet' in params['backbone']):
+ var_list = variable_filter_fn(tf.trainable_variables(),
+ params['backbone'] + '/')
+ else:
+ var_list = tf.trainable_variables()
+ l2_regularization_loss = params['l2_weight_decay'] * tf.add_n([
+ tf.nn.l2_loss(v)
+ for v in var_list
+ if 'batch_normalization' not in v.name and 'bias' not in v.name
+ ])
+ total_loss = (total_rpn_loss + total_fast_rcnn_loss + mask_loss + l2_regularization_loss)
+
+ host_call = None
+ if mode == tf.estimator.ModeKeys.TRAIN:
+ optimizer = create_optimizer(learning_rate, params)
+ if params['use_tpu']:
+ optimizer = tf.tpu.CrossShardOptimizer(optimizer)
+
+ scaffold_fn = None
+ if params['warm_start_path']:
+
+ def warm_start_scaffold_fn():
+ logging.info(
+ 'model_fn warm start from: %s,', params['warm_start_path'])
+ assignment_map = _build_assigment_map(
+ optimizer,
+ prefix=None,
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['warm_start_path'], assignment_map)
+ return tf.train.Scaffold()
+
+ scaffold_fn = warm_start_scaffold_fn
+
+ elif params['checkpoint']:
+
+ def backbone_scaffold_fn():
+ """Loads pretrained model through scaffold function."""
+ # Exclude all variables of the optimizer.
+ vars_to_load = _build_assigment_map(
+ optimizer,
+ prefix=params['backbone'] + '/',
+ skip_variables_regex=params['skip_checkpoint_variables'])
+ tf.train.init_from_checkpoint(params['checkpoint'], vars_to_load)
+ if not vars_to_load:
+ raise ValueError('Variables to load is empty.')
+ return tf.train.Scaffold()
+
+ scaffold_fn = backbone_scaffold_fn
+
+ # Batch norm requires update_ops to be added as a train_op dependency.
+ update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+ grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
+ if params['global_gradient_clip_ratio'] > 0:
+ # Clips the gradients for training stability.
+ # Refer: https://arxiv.org/abs/1211.5063
+ with tf.name_scope('clipping'):
+ old_grads, variables = zip(*grads_and_vars)
+ num_weights = sum(
+ g.shape.num_elements() for g in old_grads if g is not None)
+ clip_norm = params['global_gradient_clip_ratio'] * math.sqrt(
+ num_weights)
+ logging.info(
+ 'Global clip norm set to %g for %d variables with %d elements.',
+ clip_norm, sum(1 for g in old_grads if g is not None),
+ num_weights)
+ gradients, _ = tf.clip_by_global_norm(old_grads, clip_norm)
+ else:
+ gradients, variables = zip(*grads_and_vars)
+ grads_and_vars = []
+ # Special treatment for biases (beta is named as bias in reference model)
+ # Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/modeling/optimizer.py#L113 # pylint: disable=line-too-long
+ for grad, var in zip(gradients, variables):
+ if grad is not None and ('beta' in var.name or 'bias' in var.name):
+ grad = 2.0 * grad
+ grads_and_vars.append((grad, var))
+
+ with tf.control_dependencies(update_ops):
+ train_op = optimizer.apply_gradients(
+ grads_and_vars, global_step=global_step)
+
+ if params['use_host_call']:
+ def host_call_fn(global_step, total_loss, total_rpn_loss, rpn_score_loss,
+ rpn_box_loss, total_fast_rcnn_loss, fast_rcnn_class_loss,
+ fast_rcnn_box_loss, mask_loss, l2_regularization_loss,
+ learning_rate):
+ """Training host call. Creates scalar summaries for training metrics.
+
+ This function is executed on the CPU and should not directly reference
+ any Tensors in the rest of the `model_fn`. To pass Tensors from the
+ model to this function, provide them as part of the `host_call`. See
+ https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
+ for more information.
+
+ Arguments should match the list of `Tensor` objects passed as the second
+ element in the tuple passed to `host_call`.
+
+ Args:
+ global_step: `Tensor` with shape `[batch, ]` for the global_step.
+ total_loss: `Tensor` with shape `[batch, ]` for the training loss.
+ total_rpn_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ loss.
+ rpn_score_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ score loss.
+ rpn_box_loss: `Tensor` with shape `[batch, ]` for the training RPN
+ box loss.
+ total_fast_rcnn_loss: `Tensor` with shape `[batch, ]` for the
+ training Fast-RCNN loss.
+ fast_rcnn_class_loss: `Tensor` with shape `[batch, ]` for the
+ training Fast-RCNN class loss.
+ fast_rcnn_box_loss: `Tensor` with shape `[batch, ]` for the
+ training Fast-RCNN box loss.
+ mask_loss: `Tensor` with shape `[batch, ]` for the training Mask-RCNN
+ mask loss.
+ l2_regularization_loss: `Tensor` with shape `[batch, ]` for the
+ regularization loss.
+ learning_rate: `Tensor` with shape `[batch, ]` for the learning_rate.
+
+ Returns:
+ List of summary ops to run on the CPU host.
+ """
+ # Outfeed supports int32 but global_step is expected to be int64.
+ global_step = tf.reduce_mean(global_step)
+ # Host call fns are executed FLAGS.iterations_per_loop times after one
+ # TPU loop is finished; setting max_queue to the same number of
+ # iterations makes the summary writer flush the data to storage only
+ # once per loop.
+ with (tf2.summary.create_file_writer(
+ params['model_dir'],
+ max_queue=params['iterations_per_loop']).as_default()):
+ with tf2.summary.record_if(True):
+ tf2.summary.scalar(
+ 'total_loss', tf.reduce_mean(total_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_rpn_loss', tf.reduce_mean(total_rpn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_score_loss', tf.reduce_mean(rpn_score_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'rpn_box_loss', tf.reduce_mean(rpn_box_loss), step=global_step)
+ tf2.summary.scalar(
+ 'total_fast_rcnn_loss', tf.reduce_mean(total_fast_rcnn_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_class_loss', tf.reduce_mean(fast_rcnn_class_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'fast_rcnn_box_loss', tf.reduce_mean(fast_rcnn_box_loss),
+ step=global_step)
+ if params['include_mask']:
+ tf2.summary.scalar(
+ 'mask_loss', tf.reduce_mean(mask_loss), step=global_step)
+ tf2.summary.scalar(
+ 'l2_regularization_loss',
+ tf.reduce_mean(l2_regularization_loss),
+ step=global_step)
+ tf2.summary.scalar(
+ 'learning_rate', tf.reduce_mean(learning_rate),
+ step=global_step)
+
+ return tf.summary.all_v2_summary_ops()
+
+ # To log the loss, current learning rate, and epoch for Tensorboard, the
+ # summary op needs to be run on the host CPU via host_call. host_call
+ # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+ # dimension. These Tensors are implicitly concatenated to
+ # [params['batch_size']].
+ global_step_t = tf.reshape(global_step, [1])
+ total_loss_t = tf.reshape(total_loss, [1])
+ total_rpn_loss_t = tf.reshape(total_rpn_loss, [1])
+ rpn_score_loss_t = tf.reshape(rpn_score_loss, [1])
+ rpn_box_loss_t = tf.reshape(rpn_box_loss, [1])
+ total_fast_rcnn_loss_t = tf.reshape(total_fast_rcnn_loss, [1])
+ fast_rcnn_class_loss_t = tf.reshape(fast_rcnn_class_loss, [1])
+ fast_rcnn_box_loss_t = tf.reshape(fast_rcnn_box_loss, [1])
+ mask_loss_t = tf.reshape(mask_loss, [1])
+ l2_regularization_loss_t = tf.reshape(l2_regularization_loss, [1])
+ learning_rate_t = tf.reshape(learning_rate, [1])
+ host_call = (host_call_fn,
+ [global_step_t, total_loss_t, total_rpn_loss_t,
+ rpn_score_loss_t, rpn_box_loss_t, total_fast_rcnn_loss_t,
+ fast_rcnn_class_loss_t, fast_rcnn_box_loss_t,
+ mask_loss_t, l2_regularization_loss_t, learning_rate_t])
+ else:
+ train_op = None
+ scaffold_fn = None
+
+ if params['use_tpu']:
+ return tf.estimator.tpu.TPUEstimatorSpec(
+ mode=mode,
+ loss=total_loss,
+ train_op=train_op,
+ host_call=host_call,
+ scaffold_fn=scaffold_fn)
+ return tf.estimator.EstimatorSpec(
+ mode=mode, loss=total_loss, train_op=train_op)
+
+
+def mask_rcnn_model_fn(features, labels, mode, params):
+ """Mask-RCNN model."""
+ with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+ return _model_fn(
+ features,
+ labels,
+ mode,
+ params,
+ variable_filter_fn=remove_variables)
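For reference, the training branch above clips gradients to a global norm proportional to the square root of the total number of trainable weight elements, and doubles the gradients of bias/beta variables following the Detectron reference. Below is a minimal pure-Python sketch of that arithmetic, with hypothetical helper names and plain floats standing in for Tensors; the actual patch uses the TF ops shown in the diff (optimizer.compute_gradients, tf.clip_by_global_norm).

import math

def global_clip_norm(num_elements_per_var, clip_ratio):
    # clip_norm = ratio * sqrt(total number of trainable weight elements),
    # mirroring the computation inside the 'clipping' name scope above.
    return clip_ratio * math.sqrt(sum(num_elements_per_var))

def scale_bias_gradients(grads_and_var_names):
    # Biases (and batch-norm betas, which contain 'beta' in their names)
    # receive a 2x gradient before apply_gradients.
    out = []
    for grad, name in grads_and_var_names:
        if grad is not None and ('beta' in name or 'bias' in name):
            grad = 2.0 * grad
        out.append((grad, name))
    return out

print(global_clip_norm([1000, 64, 64], 0.02))                    # ~0.672
print(scale_bias_gradients([(0.1, 'conv/kernel'), (0.1, 'conv/bias')]))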
diff --git a/models/official/mask_rcnn/postprocess_ops.py b/models/official/mask_rcnn/postprocess_ops.py
index ecabb8b08..4a829ce68 100644
--- a/models/official/mask_rcnn/postprocess_ops.py
+++ b/models/official/mask_rcnn/postprocess_ops.py
@@ -24,7 +24,6 @@
def generate_detections_per_image_tpu(cls_outputs,
- box_outputs,
anchor_boxes,
image_info,
pre_nms_num_detections=1000,
@@ -70,18 +69,20 @@ def generate_detections_per_image_tpu(cls_outputs,
top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)
anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
- box_outputs = tf.reshape(
- box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
+ boxes = anchor_boxes
+ # box_outputs = tf.reshape(
+ # box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
class_indices = classes
- box_outputs = tf.gather_nd(box_outputs,
- tf.stack([top_k_indices, class_indices], axis=1))
+ # box_outputs = tf.gather_nd(box_outputs,
+ # tf.stack([top_k_indices, class_indices], axis=1))
# apply bounding box regression to anchors
- boxes = box_utils.decode_boxes(
- box_outputs, anchor_boxes, bbox_reg_weights)
+ # boxes = box_utils.decode_boxes(
+ # box_outputs, anchor_boxes, bbox_reg_weights)
boxes = box_utils.clip_boxes(
boxes, image_info[0], image_info[1])
+
list_of_all_boxes = []
list_of_all_scores = []
list_of_all_classes = []
@@ -134,7 +135,6 @@ def generate_detections_per_image_tpu(cls_outputs,
def generate_detections_tpu(class_outputs,
- box_outputs,
anchor_boxes,
image_info,
pre_nms_num_detections=1000,
@@ -175,7 +175,7 @@ def generate_detections_tpu(class_outputs,
num_valid_boxes, box_coordinates, box_classes, box_scores = ([], [], [], [])
for i in range(batch_size):
result = generate_detections_per_image_tpu(
- softmax_class_outputs[i], box_outputs[i], anchor_boxes[i],
+ softmax_class_outputs[i], anchor_boxes[i],
image_info[i], pre_nms_num_detections, post_nms_num_detections,
nms_threshold, bbox_reg_weights)
@@ -192,7 +192,6 @@ def generate_detections_tpu(class_outputs,
def generate_detections_gpu(class_outputs,
- box_outputs,
anchor_boxes,
image_info,
pre_nms_num_detections=1000,
@@ -232,22 +231,24 @@ def generate_detections_gpu(class_outputs,
# Remove background
scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
- boxes = tf.slice(
- tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
- [0, 0, 1, 0], [-1, -1, -1, -1])
+ # boxes = tf.slice(
+ # tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
+ # [0, 0, 1, 0], [-1, -1, -1, -1])
anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
tf.ones([1, 1, num_classes - 1, 1]))
num_detections = num_boxes * (num_classes - 1)
- boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
+ # boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
scores = tf.reshape(scores, [batch_size, num_detections, 1])
anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
# Decode
- boxes = box_utils.decode_boxes(
- boxes, anchor_boxes, bbox_reg_weights)
+ # boxes = box_utils.decode_boxes(
+ # boxes, anchor_boxes, bbox_reg_weights)
+
+ boxes = anchor_boxes
# Clip boxes
height = tf.expand_dims(image_info[:, 0:1], axis=-1)
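The hunk above bypasses bounding-box regression in postprocess_ops.py: the commented-out calls to box_utils.decode_boxes are what normally turn (dx, dy, dw, dh) offsets into final boxes, and the patched code now passes the anchor/ROI boxes through unchanged. As a reminder of what is being skipped, here is a rough single-box sketch of the usual Faster R-CNN decoding, assuming the (ymin, xmin, ymax, xmax) ordering and the default bbox_reg_weights; the exact box_utils implementation in this repo may differ in detail.

import math

def decode_box(delta, anchor, weights=(10., 10., 5., 5.)):
    # Deltas are divided by the regression weights, then applied to the
    # anchor's centre and size; zero deltas reproduce the anchor exactly.
    dx, dy, dw, dh = (d / w for d, w in zip(delta, weights))
    ymin, xmin, ymax, xmax = anchor
    ha, wa = ymax - ymin, xmax - xmin
    cy, cx = ymin + 0.5 * ha, xmin + 0.5 * wa
    cy, cx = cy + dy * ha, cx + dx * wa
    h, w = ha * math.exp(dh), wa * math.exp(dw)
    return (cy - 0.5 * h, cx - 0.5 * w, cy + 0.5 * h, cx + 0.5 * w)

# With zero deltas the decoded box equals the anchor, which is effectively
# what `boxes = anchor_boxes` in the patch produces for every detection.
print(decode_box((0., 0., 0., 0.), (10., 10., 50., 90.)))  # (10, 10, 50, 90)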
diff --git a/models/official/mask_rcnn/postprocess_ops_back.py b/models/official/mask_rcnn/postprocess_ops_back.py
new file mode 100644
index 000000000..ecabb8b08
--- /dev/null
+++ b/models/official/mask_rcnn/postprocess_ops_back.py
@@ -0,0 +1,278 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops used to post-process raw detections."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+import box_utils
+
+
+def generate_detections_per_image_tpu(cls_outputs,
+ box_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections per image given the model outputs.
+
+ Args:
+ cls_outputs: a tensor with shape [N, num_classes], which stacks class
+ logit outputs on all feature levels. The N is the number of total anchors
+ on all levels. The num_classes is the number of classes predicted by the
+ model. Note that the cls_outputs should be the output of softmax().
+ box_outputs: a tensor with shape [N, num_classes*4], which stacks box
+ regression outputs on all feature levels. The N is the number of total
+ anchors on all levels.
+ anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
+ feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [5] which encodes the input image's [height,
+ width, scale, original_height, original_width]
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ detections: Tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores
+ -- respectively.
+ """
+ num_boxes, num_classes = cls_outputs.get_shape().as_list()
+
+ # Remove background class scores.
+ cls_outputs = cls_outputs[:, 1:num_classes]
+ top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
+ tf.reshape(cls_outputs, [-1]),
+ k=pre_nms_num_detections,
+ sorted=False)
+ classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
+ top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)
+
+ anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
+ box_outputs = tf.reshape(
+ box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
+ class_indices = classes
+ box_outputs = tf.gather_nd(box_outputs,
+ tf.stack([top_k_indices, class_indices], axis=1))
+
+ # apply bounding box regression to anchors
+ boxes = box_utils.decode_boxes(
+ box_outputs, anchor_boxes, bbox_reg_weights)
+ boxes = box_utils.clip_boxes(
+ boxes, image_info[0], image_info[1])
+
+ list_of_all_boxes = []
+ list_of_all_scores = []
+ list_of_all_classes = []
+ # Skip background class.
+ for class_i in range(num_classes):
+ # Compute bitmask for the given classes.
+ class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
+ # This works because score is in [0, 1].
+ class_i_scores = top_k_scores * class_i_bitmask
+ # The TPU and CPU have different behaviors for
+ # tf.image.non_max_suppression_padded (b/116754376).
+ (class_i_post_nms_indices,
+ class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
+ tf.to_float(boxes),
+ tf.to_float(class_i_scores),
+ post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.05,
+ pad_to_max_output_size=True,
+ name='nms_detections_' + str(class_i))
+ class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
+ class_i_post_nms_scores = tf.gather(class_i_scores,
+ class_i_post_nms_indices)
+ mask = tf.less(tf.range(post_nms_num_detections), [class_i_nms_num_valid])
+ class_i_post_nms_scores = tf.where(
+ mask, class_i_post_nms_scores, tf.zeros_like(class_i_post_nms_scores))
+ class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores), class_i+1)
+ list_of_all_boxes.append(class_i_post_nms_boxes)
+ list_of_all_scores.append(class_i_post_nms_scores)
+ list_of_all_classes.append(class_i_classes)
+
+ post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
+ post_nms_scores = tf.concat(list_of_all_scores, axis=0)
+ post_nms_classes = tf.concat(list_of_all_classes, axis=0)
+
+ # sort all results.
+ post_nms_scores, sorted_indices = tf.nn.top_k(
+ tf.to_float(post_nms_scores),
+ k=post_nms_num_detections,
+ sorted=True)
+ post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
+ post_nms_classes = tf.gather(post_nms_classes, sorted_indices)
+
+ valid_mask = tf.where(
+ tf.greater(post_nms_scores, 0), tf.ones_like(post_nms_scores),
+ tf.zeros_like(post_nms_scores))
+ num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
+ box_classes = tf.to_float(post_nms_classes)
+ return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
+
+
+def generate_detections_tpu(class_outputs,
+ box_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (TPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
+ stacks box regression outputs on all feature levels. The N is the number
+ of total anchors on all levels.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, _, _ = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ num_valid_boxes, box_coordinates, box_classes, box_scores = ([], [], [], [])
+ for i in range(batch_size):
+ result = generate_detections_per_image_tpu(
+ softmax_class_outputs[i], box_outputs[i], anchor_boxes[i],
+ image_info[i], pre_nms_num_detections, post_nms_num_detections,
+ nms_threshold, bbox_reg_weights)
+
+ num_valid_boxes.append(result[0])
+ box_coordinates.append(result[1])
+ box_classes.append(result[2])
+ box_scores.append(result[3])
+
+ num_valid_boxes = tf.stack(num_valid_boxes)
+ box_coordinates = tf.stack(box_coordinates)
+ box_classes = tf.stack(box_classes)
+ box_scores = tf.stack(box_scores)
+ return num_valid_boxes, box_coordinates, box_classes, box_scores
+
+
+def generate_detections_gpu(class_outputs,
+ box_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (GPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ box_outputs: a tensor with shape [batch_size, N, num_classes*4], which
+ stacks box regression outputs on all feature levels. The N is the number
+ of total anchors on all levels.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ # Remove background
+ scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
+ boxes = tf.slice(
+ tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
+ [0, 0, 1, 0], [-1, -1, -1, -1])
+
+ anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
+ tf.ones([1, 1, num_classes - 1, 1]))
+
+ num_detections = num_boxes * (num_classes - 1)
+
+ boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
+ scores = tf.reshape(scores, [batch_size, num_detections, 1])
+ anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
+
+ # Decode
+ boxes = box_utils.decode_boxes(
+ boxes, anchor_boxes, bbox_reg_weights)
+
+ # Clip boxes
+ height = tf.expand_dims(image_info[:, 0:1], axis=-1)
+ width = tf.expand_dims(image_info[:, 1:2], axis=-1)
+ boxes = box_utils.clip_boxes(boxes, height, width)
+
+ # NMS
+ pre_nms_boxes = box_utils.to_normalized_coordinates(
+ boxes, height, width)
+ pre_nms_boxes = tf.reshape(
+ pre_nms_boxes, [batch_size, num_boxes, num_classes - 1, 4])
+ pre_nms_scores = tf.reshape(
+ scores, [batch_size, num_boxes, num_classes - 1])
+ (post_nms_boxes, post_nms_scores, post_nms_classes,
+ post_nms_num_valid_boxes) = (
+ tf.image.combined_non_max_suppression(
+ pre_nms_boxes,
+ pre_nms_scores,
+ max_output_size_per_class=pre_nms_num_detections,
+ max_total_size=post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.0,
+ pad_per_class=False))
+ post_nms_classes = post_nms_classes + 1
+ post_nms_boxes = box_utils.to_absolute_coordinates(
+ post_nms_boxes, height, width)
+ return (post_nms_num_valid_boxes, post_nms_boxes,
+ tf.to_float(post_nms_classes), post_nms_scores)
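One detail worth calling out in generate_detections_per_image_tpu (both the original above and the patched variant below) is the per-class NMS trick: because softmax scores lie in [0, 1], multiplying by a 0/1 class bitmask pushes every box of other classes to score 0, so a single padded NMS call per class can run over the full top-k candidate set. A toy illustration of just the masking step, in plain Python with hypothetical values:

def per_class_scores(scores, classes, class_i):
    # Boxes whose class differs from class_i are pushed to score 0.0, so NMS
    # and the subsequent score-threshold test ignore them.
    return [s if c == class_i else 0.0 for s, c in zip(scores, classes)]

print(per_class_scores([0.9, 0.7, 0.4], [0, 1, 0], 0))  # [0.9, 0.0, 0.4]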
diff --git a/models/official/mask_rcnn/postprocess_ops_new.py b/models/official/mask_rcnn/postprocess_ops_new.py
new file mode 100644
index 000000000..4a829ce68
--- /dev/null
+++ b/models/official/mask_rcnn/postprocess_ops_new.py
@@ -0,0 +1,279 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops used to post-process raw detections."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v1 as tf
+
+import box_utils
+
+
+def generate_detections_per_image_tpu(cls_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections per image given the model outputs.
+
+ Args:
+ cls_outputs: a tensor with shape [N, num_classes], which stacks class
+ logit outputs on all feature levels. The N is the number of total anchors
+ on all levels. The num_classes is the number of classes predicted by the
+ model. Note that the cls_outputs should be the output of softmax().
+ anchor_boxes: a tensor with shape [N, 4], which stacks anchors on all
+ feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [5] which encodes the input image's [height,
+ width, scale, original_height, original_width]
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ detections: Tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores
+ -- respectively.
+ """
+ num_boxes, num_classes = cls_outputs.get_shape().as_list()
+
+ # Remove background class scores.
+ cls_outputs = cls_outputs[:, 1:num_classes]
+ top_k_scores, top_k_indices_with_classes = tf.nn.top_k(
+ tf.reshape(cls_outputs, [-1]),
+ k=pre_nms_num_detections,
+ sorted=False)
+ classes = tf.mod(top_k_indices_with_classes, num_classes - 1)
+ top_k_indices = tf.floordiv(top_k_indices_with_classes, num_classes - 1)
+
+ anchor_boxes = tf.gather(anchor_boxes, top_k_indices)
+ boxes = anchor_boxes
+ # box_outputs = tf.reshape(
+ # box_outputs, [num_boxes, num_classes, 4])[:, 1:num_classes, :]
+ class_indices = classes
+ # box_outputs = tf.gather_nd(box_outputs,
+ # tf.stack([top_k_indices, class_indices], axis=1))
+
+ # apply bounding box regression to anchors
+ # boxes = box_utils.decode_boxes(
+ # box_outputs, anchor_boxes, bbox_reg_weights)
+ boxes = box_utils.clip_boxes(
+ boxes, image_info[0], image_info[1])
+
+
+ list_of_all_boxes = []
+ list_of_all_scores = []
+ list_of_all_classes = []
+ # Skip background class.
+ for class_i in range(num_classes):
+ # Compute bitmask for the given classes.
+ class_i_bitmask = tf.cast(tf.equal(classes, class_i), top_k_scores.dtype)
+ # This works because score is in [0, 1].
+ class_i_scores = top_k_scores * class_i_bitmask
+ # The TPU and CPU have different behaviors for
+ # tf.image.non_max_suppression_padded (b/116754376).
+ (class_i_post_nms_indices,
+ class_i_nms_num_valid) = tf.image.non_max_suppression_padded(
+ tf.to_float(boxes),
+ tf.to_float(class_i_scores),
+ post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.05,
+ pad_to_max_output_size=True,
+ name='nms_detections_' + str(class_i))
+ class_i_post_nms_boxes = tf.gather(boxes, class_i_post_nms_indices)
+ class_i_post_nms_scores = tf.gather(class_i_scores,
+ class_i_post_nms_indices)
+ mask = tf.less(tf.range(post_nms_num_detections), [class_i_nms_num_valid])
+ class_i_post_nms_scores = tf.where(
+ mask, class_i_post_nms_scores, tf.zeros_like(class_i_post_nms_scores))
+ class_i_classes = tf.fill(tf.shape(class_i_post_nms_scores), class_i+1)
+ list_of_all_boxes.append(class_i_post_nms_boxes)
+ list_of_all_scores.append(class_i_post_nms_scores)
+ list_of_all_classes.append(class_i_classes)
+
+ post_nms_boxes = tf.concat(list_of_all_boxes, axis=0)
+ post_nms_scores = tf.concat(list_of_all_scores, axis=0)
+ post_nms_classes = tf.concat(list_of_all_classes, axis=0)
+
+ # sort all results.
+ post_nms_scores, sorted_indices = tf.nn.top_k(
+ tf.to_float(post_nms_scores),
+ k=post_nms_num_detections,
+ sorted=True)
+ post_nms_boxes = tf.gather(post_nms_boxes, sorted_indices)
+ post_nms_classes = tf.gather(post_nms_classes, sorted_indices)
+
+ valid_mask = tf.where(
+ tf.greater(post_nms_scores, 0), tf.ones_like(post_nms_scores),
+ tf.zeros_like(post_nms_scores))
+ num_valid_boxes = tf.reduce_sum(valid_mask, axis=-1)
+ box_classes = tf.to_float(post_nms_classes)
+ return num_valid_boxes, post_nms_boxes, box_classes, post_nms_scores
+
+
+def generate_detections_tpu(class_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (TPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, _, _ = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ num_valid_boxes, box_coordinates, box_classes, box_scores = ([], [], [], [])
+ for i in range(batch_size):
+ result = generate_detections_per_image_tpu(
+ softmax_class_outputs[i], anchor_boxes[i],
+ image_info[i], pre_nms_num_detections, post_nms_num_detections,
+ nms_threshold, bbox_reg_weights)
+
+ num_valid_boxes.append(result[0])
+ box_coordinates.append(result[1])
+ box_classes.append(result[2])
+ box_scores.append(result[3])
+
+ num_valid_boxes = tf.stack(num_valid_boxes)
+ box_coordinates = tf.stack(box_coordinates)
+ box_classes = tf.stack(box_classes)
+ box_scores = tf.stack(box_scores)
+ return num_valid_boxes, box_coordinates, box_classes, box_scores
+
+
+def generate_detections_gpu(class_outputs,
+ anchor_boxes,
+ image_info,
+ pre_nms_num_detections=1000,
+ post_nms_num_detections=100,
+ nms_threshold=0.3,
+ bbox_reg_weights=(10., 10., 5., 5.)):
+ """Generate the final detections given the model outputs (GPU version).
+
+ Args:
+ class_outputs: a tensor with shape [batch_size, N, num_classes], which
+ stacks class logit outputs on all feature levels. The N is the number of
+ total anchors on all levels. The num_classes is the number of classes
+ predicted by the model. Note that the class_outputs here is the raw score.
+ anchor_boxes: a tensor with shape [batch_size, N, 4], which stacks anchors
+ on all feature levels. The N is the number of total anchors on all levels.
+ image_info: a tensor of shape [batch_size, 5] which encodes each image's
+ [height, width, scale, original_height, original_width].
+ pre_nms_num_detections: an integer that specifies the number of candidates
+ before NMS.
+ post_nms_num_detections: an integer that specifies the number of candidates
+ after NMS.
+ nms_threshold: a float number to specify the IOU threshold of NMS.
+ bbox_reg_weights: a list of 4 float scalars, which are default weights on
+ (dx, dy, dw, dh) for normalizing bbox regression targets.
+
+ Returns:
+ a tuple of tensors corresponding to number of valid boxes,
+ box coordinates, object categories for each box, and box scores stacked
+ in batch_size.
+ """
+ with tf.name_scope('generate_detections'):
+ batch_size, num_boxes, num_classes = class_outputs.get_shape().as_list()
+ softmax_class_outputs = tf.nn.softmax(class_outputs)
+
+ # Remove background
+ scores = tf.slice(softmax_class_outputs, [0, 0, 1], [-1, -1, -1])
+ # boxes = tf.slice(
+ # tf.reshape(box_outputs, [batch_size, num_boxes, num_classes, 4]),
+ # [0, 0, 1, 0], [-1, -1, -1, -1])
+
+ anchor_boxes = (tf.expand_dims(anchor_boxes, axis=2) *
+ tf.ones([1, 1, num_classes - 1, 1]))
+
+ num_detections = num_boxes * (num_classes - 1)
+
+ # boxes = tf.reshape(boxes, [batch_size, num_detections, 4])
+ scores = tf.reshape(scores, [batch_size, num_detections, 1])
+ anchor_boxes = tf.reshape(anchor_boxes, [batch_size, num_detections, 4])
+
+ # Decode
+ # boxes = box_utils.decode_boxes(
+ # boxes, anchor_boxes, bbox_reg_weights)
+
+ boxes = anchor_boxes
+
+ # Clip boxes
+ height = tf.expand_dims(image_info[:, 0:1], axis=-1)
+ width = tf.expand_dims(image_info[:, 1:2], axis=-1)
+ boxes = box_utils.clip_boxes(boxes, height, width)
+
+ # NMS
+ pre_nms_boxes = box_utils.to_normalized_coordinates(
+ boxes, height, width)
+ pre_nms_boxes = tf.reshape(
+ pre_nms_boxes, [batch_size, num_boxes, num_classes - 1, 4])
+ pre_nms_scores = tf.reshape(
+ scores, [batch_size, num_boxes, num_classes - 1])
+ (post_nms_boxes, post_nms_scores, post_nms_classes,
+ post_nms_num_valid_boxes) = (
+ tf.image.combined_non_max_suppression(
+ pre_nms_boxes,
+ pre_nms_scores,
+ max_output_size_per_class=pre_nms_num_detections,
+ max_total_size=post_nms_num_detections,
+ iou_threshold=nms_threshold,
+ score_threshold=0.0,
+ pad_per_class=False))
+ post_nms_classes = post_nms_classes + 1
+ post_nms_boxes = box_utils.to_absolute_coordinates(
+ post_nms_boxes, height, width)
+ return (post_nms_num_valid_boxes, post_nms_boxes,
+ tf.to_float(post_nms_classes), post_nms_scores)
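Finally, the GPU path reshapes its inputs so that every (box, foreground class) pair becomes one candidate detection before tf.image.combined_non_max_suppression. A small NumPy shape walk-through of that broadcast-and-flatten, using toy sizes (the real sizes come from class_outputs):

import numpy as np

batch_size, num_boxes, num_classes = 2, 5, 4     # 4 classes incl. background
anchor_boxes = np.zeros((batch_size, num_boxes, 4))

# Mirror of tf.expand_dims(anchor_boxes, 2) * tf.ones([1, 1, num_classes-1, 1]):
# tile each anchor once per foreground class.
tiled = anchor_boxes[:, :, np.newaxis, :] * np.ones((1, 1, num_classes - 1, 1))
print(tiled.shape)                               # (2, 5, 3, 4)

# Flatten to one candidate per (box, class) pair, as fed to NMS.
num_detections = num_boxes * (num_classes - 1)
print(tiled.reshape(batch_size, num_detections, 4).shape)   # (2, 15, 4)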