@@ -3,7 +3,9 @@
 # orttraining_test_ortmodule_api.py
 
 import torch
+from transformers import AutoConfig, BertForSequenceClassification
 import pytest
+from unittest.mock import patch
 
 import onnxruntime
 from onnxruntime.training import ORTModule
@@ -84,6 +86,34 @@ def forward(self, model_input, x=None, y=None, z=None):
         out = self.fc2(out)
         return out
 
+def _get_bert_for_sequence_classification_model(device):
+    """Returns the BertForSequenceClassification pretrained model"""
+
+    config = AutoConfig.from_pretrained(
+        "bert-base-uncased",
+        num_labels=2,
+        num_hidden_layers=1,
+        output_attentions=False,
+        output_hidden_states=False,
+    )
+
+    model = BertForSequenceClassification.from_pretrained(
+        "bert-base-uncased",
+        config=config,
+    ).to(device)
+
+    return model
+
+def _get_bert_for_sequence_classification_sample_data(device):
+    """Returns sample data to be used with BertForSequenceClassification model"""
+
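+    # Batch of 32 sequences of length 64; attention mask and labels are binary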
+    input_ids = torch.randint(0, 100, (32, 64), dtype=torch.long, device=device)
+    input_mask = torch.randint(0, 2, (32, 64), dtype=torch.long, device=device)
+    labels = torch.randint(0, 2, (32,), dtype=torch.long, device=device)
+
+    return input_ids, input_mask, labels
+
 # ORTModule-API tests
 
 def test_forward_call_single_positional_argument():
@@ -286,3 +316,37 @@ def test_changes_input_requires_grad_reinitializes_module_gradient_graph_builder
     module_gradient_graph_builder = model._module_gradient_graph_builder
     model(x)
     assert module_gradient_graph_builder != model._module_gradient_graph_builder
+
+def test_gpu_reserved_memory_with_torch_no_grad():
+    device = 'cuda'
+
+    # Create a model and record the reserved GPU memory before and after an
+    # export that runs under torch.no_grad (the default export path)
+    model_with_no_grad = _get_bert_for_sequence_classification_model(device)
+    x, y, z = _get_bert_for_sequence_classification_sample_data(device)
+
+    torch.cuda.empty_cache()
+    model_with_no_grad = ORTModule(model_with_no_grad)
+    mem_reserved_before_export = torch.cuda.memory_reserved(device)
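+    # Positional arguments follow BertForSequenceClassification.forward:
+    # (input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels)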
+    model_with_no_grad(x, y, None, None, None, None, z)
+    mem_reserved_after_export_with_torch_no_grad = torch.cuda.memory_reserved(device)
+    del model_with_no_grad
+    torch.cuda.empty_cache()
+    mem_reserved_after_cache_empty = torch.cuda.memory_reserved(device)
+    assert mem_reserved_before_export == mem_reserved_after_cache_empty
+
+    # Create another model and record the reserved GPU memory after an export
+    # performed without torch.no_grad
+    model_without_no_grad = _get_bert_for_sequence_classification_model(device)
+    model_without_no_grad = ORTModule(model_without_no_grad)
+    mem_reserved_after_export_without_torch_no_grad = 0
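+    # Patching torch.no_grad swaps it for a MagicMock, so the no_grad context
+    # that normally guards the export becomes a no-op and gradient state is allocated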
+    with patch('torch.no_grad'):
+        model_without_no_grad(x, y, None, None, None, None, z)
+        mem_reserved_after_export_without_torch_no_grad = torch.cuda.memory_reserved(device)
+
+    assert mem_reserved_after_export_with_torch_no_grad < mem_reserved_after_export_without_torch_no_grad
+    assert mem_reserved_before_export < mem_reserved_after_export_with_torch_no_grad