Skip to content

Commit 6694fda

Browse files
Elias Ellison and pytorchmergebot
Elias Ellison
authored and committed
Clean up profiling mode and profiling executor strategy (pytorch#73875)
Summary: Pull Request resolved: pytorch#73875 Previously we had a few settings: - getExecutor - which toggled between Profiling Executor and Legacy - getGraphOptimize - if true, overrides PE/Legacy to run with simple executor (no optimizations) and then... - getProfilingMode - which would set PE to 0 specializations. The last mode is redundant with getGraphOptimize, we should just remove it and use getGraphOptimize in these cases. It would lead to potentially invalid combinations of logic - what does it mean if getProfilingMode is true but getExecutor is set to false? This would lead to a bug in specialize_autograd_zero in this case, see: https://github.com/pytorch/pytorch/blob/master/torch%2Fcsrc%2Fjit%2Fpasses%2Fspecialize_autogradzero.cpp#L93. The tests here are failing but get fixed with the PR above it, so I'll squash for landing. Test Plan: Imported from OSS Reviewed By: cpuhrsch Differential Revision: D34938130 Pulled By: eellison fbshipit-source-id: 1a9c0ae7f6d1cfddc2ed3499a5af611053ae5e1b (cherry picked from commit cf69ce3)
1 parent ab57876 commit 6694fda

22 files changed

+93
-91
lines changed

aten/src/ATen/core/builtin_function.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ struct BuiltinOpFunction : public Function {
6262
return *this;
6363
}
6464

65-
bool call(Stack& stack, size_t, c10::function_ref<void(const Code&)>) override {
65+
bool call(Stack& stack, c10::optional<size_t>, c10::function_ref<void(const Code&)>) override {
6666
run(stack);
6767
return false;
6868
}

aten/src/ATen/core/function.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ struct TORCH_API Function {
9090
// call() returns false.
9191

9292
// Overload for server interpreter, a bailout size is needed for graph executor.
93-
virtual bool call(Stack&, size_t, c10::function_ref<void(const Code&)>) {
93+
virtual bool call(Stack&, c10::optional<size_t>, c10::function_ref<void(const Code&)>) {
9494
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false);
9595
return false;
9696
}

benchmarks/fastrnns/fuser.py

+6-7
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,18 @@ def set_fuser(fuser_name, executor_name):
44
assert fuser_name in ['te', 'old', 'none', 'default']
55
if fuser_name == 'te':
66
torch._C._jit_set_profiling_executor(True)
7-
torch._C._jit_set_profiling_mode(True)
7+
torch._C._get_graph_executor_optimize(True)
88
torch._C._jit_override_can_fuse_on_cpu(False)
99
torch._C._jit_override_can_fuse_on_gpu(True)
1010
torch._C._jit_set_texpr_fuser_enabled(True)
1111
elif fuser_name == 'old':
1212
torch._C._jit_set_profiling_executor(False)
13-
torch._C._jit_set_profiling_mode(False)
13+
torch._C._get_graph_executor_optimize(False)
1414
torch._C._jit_override_can_fuse_on_gpu(True)
1515
torch._C._jit_set_texpr_fuser_enabled(False)
1616
elif fuser_name == 'none':
1717
torch._C._jit_set_profiling_executor(False)
18-
torch._C._jit_set_profiling_mode(False)
18+
torch._C._get_graph_executor_optimize(False)
1919
torch._C._jit_override_can_fuse_on_gpu(False)
2020
torch._C._jit_override_can_fuse_on_cpu(False)
2121
torch._C._jit_set_texpr_fuser_enabled(False)
@@ -25,12 +25,11 @@ def set_fuser(fuser_name, executor_name):
2525
# --executor overrides settings of --fuser
2626
if executor_name == 'profiling':
2727
torch._C._jit_set_profiling_executor(True)
28-
torch._C._jit_set_profiling_mode(True)
28+
torch._C._get_graph_executor_optimize(True)
2929
elif executor_name == 'simple':
30-
torch._C._jit_set_profiling_executor(True)
31-
torch._C._jit_set_profiling_mode(False)
30+
torch._C._get_graph_executor_optimize(False)
3231
elif executor_name == 'legacy':
3332
torch._C._jit_set_profiling_executor(False)
34-
torch._C._jit_set_profiling_mode(False)
33+
torch._C._get_graph_executor_optimize(True)
3534
elif executor_name == 'default':
3635
pass

benchmarks/tensorexpr/__main__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def main():
137137
torch._C._jit_set_profiling_executor(True)
138138
torch._C._jit_set_texpr_fuser_enabled(True)
139139
torch._C._jit_override_can_fuse_on_gpu(True)
140-
torch._C._jit_set_profiling_mode(True)
140+
torch._C._get_graph_executor_optimize(True)
141141
elif args.cuda_fuser == "old":
142142
import torch
143143
torch._C._jit_set_profiling_executor(False)
@@ -148,7 +148,7 @@ def main():
148148
torch._C._jit_set_profiling_executor(True)
149149
torch._C._jit_set_texpr_fuser_enabled(False)
150150
torch._C._jit_set_nvfuser_enabled(True)
151-
torch._C._jit_set_profiling_mode(True)
151+
torch._C._get_graph_executor_optimize(True)
152152
else :
153153
raise ValueError("Undefined fuser: {}".format(args.cuda_fuser))
154154

test/cpp/jit/test_autodiff.cpp

-3
Original file line numberDiff line numberDiff line change
@@ -289,14 +289,11 @@ class AutodiffRemoveUnusedGradientsTest : public ::testing::Test {
289289
void SetUp() override {
290290
prev_exec = getExecutorMode();
291291
getExecutorMode() = true;
292-
prev_profiling = getProfilingMode();
293-
getProfilingMode() = true;
294292
prev_inline_autodiff = getAutodiffSubgraphInlining();
295293
debugSetAutodiffSubgraphInlining(false);
296294
}
297295
void TearDown() override {
298296
getExecutorMode() = prev_exec;
299-
getProfilingMode() = prev_profiling;
300297
debugSetAutodiffSubgraphInlining(prev_inline_autodiff);
301298
}
302299

test/jit/test_profiler.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
class TestProfiler(JitTestCase):
1919
def setUp(self):
2020
self.prev_exec = torch._C._jit_set_profiling_executor(True)
21-
self.prev_profiling = torch._C._jit_set_profiling_mode(True)
21+
self.prev_profiling = torch._C._get_graph_executor_optimize(True)
2222
self.inline_autodiff = torch._C._debug_set_autodiff_subgraph_inlining(False)
2323
self.texpr_fuser_state = torch._C._jit_texpr_fuser_enabled()
2424
self.can_fuse_on_cpu = torch._C._jit_can_fuse_on_cpu()
@@ -34,7 +34,7 @@ def setUp(self):
3434

3535
def tearDown(self):
3636
torch._C._jit_set_profiling_executor(self.prev_exec)
37-
torch._C._jit_set_profiling_mode(self.prev_profiling)
37+
torch._C._get_graph_executor_optimize(self.prev_profiling)
3838
torch._C._debug_set_autodiff_subgraph_inlining(self.inline_autodiff)
3939
torch._C._jit_set_texpr_fuser_enabled(self.texpr_fuser_state)
4040
torch._C._jit_override_can_fuse_on_cpu(self.can_fuse_on_cpu)

test/test_jit.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -204,11 +204,6 @@ def doAutodiffCheck(testname):
204204
# TODO: enable TE in PE when all tests are fixed
205205
torch._C._jit_set_texpr_fuser_enabled(GRAPH_EXECUTOR == ProfilingMode.PROFILING)
206206
torch._C._jit_set_profiling_executor(GRAPH_EXECUTOR != ProfilingMode.LEGACY)
207-
# even though FULL_PROFILER should be our default
208-
# we haven't tested every single test in this file
209-
# but we enable FULL_PROFILER for a large subset
210-
# of the tests with "with enable_profiling_mode_for_profiling_tests"
211-
torch._C._jit_set_profiling_mode(False)
212207

213208
def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
214209
hx, cx = hidden
@@ -7360,7 +7355,7 @@ def test_as_tensor_tensor_input(input):
73607355
g = test_as_tensor_tensor_input.graph_for(torch.ones(3, 4))
73617356
FileCheck().check("Tensor = aten::as_tensor").check("Float(*, *, requires_grad=0, device=cpu) = aten::as_tensor").run(g)
73627357

7363-
7358+
@unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.LEGACY, "testing legacy behavior")
73647359
def test_tensor_requires_grad(self):
73657360
@torch.jit.script
73667361
def test(b):

test/test_jit_fuser_te.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
# inferred erroneously runs or skips
1919
# some tests
2020
torch._C._jit_set_profiling_executor(True)
21-
torch._C._jit_set_profiling_mode(True)
21+
torch._C._get_graph_executor_optimize(True)
2222

2323
from torch.testing._internal.common_utils import run_tests, ProfilingMode, GRAPH_EXECUTOR, \
2424
enable_profiling_mode_for_profiling_tests, slowTest
@@ -2608,7 +2608,7 @@ def setUp(self):
26082608
torch._C._jit_override_can_fuse_on_gpu(True)
26092609

26102610
self.old_profiling_executor = torch._C._jit_set_profiling_executor(True)
2611-
self.old_profiling_mode = torch._C._jit_set_profiling_mode(True)
2611+
self.old_profiling_mode = torch._C._get_graph_executor_optimize(True)
26122612

26132613
self.old_fusion_inlining = torch._C._debug_get_fusion_group_inlining()
26142614
torch._C._debug_set_fusion_group_inlining(False)
@@ -2625,7 +2625,7 @@ def setUp(self):
26252625

26262626
def tearDown(self):
26272627
torch._C._jit_set_profiling_executor(self.old_profiling_executor)
2628-
torch._C._jit_set_profiling_mode(self.old_profiling_mode)
2628+
torch._C._get_graph_executor_optimize(self.old_profiling_mode)
26292629

26302630
torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuser_state)
26312631
torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuser_state)

torch/_C/__init__.pyi.in

+1-1
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def _get_model_ops_and_info_from_buffer(buffer: BinaryIO): ...
283283
def _get_mobile_model_contained_types(filename: Union[str, Path]): ...
284284
def _get_mobile_model_contained_types_from_buffer(buffer: BinaryIO): ...
285285
def _logging_set_logger(logger: LoggerBase) -> LoggerBase: ...
286-
def _get_graph_executor_optimize() -> _bool: ...
286+
def _get_graph_executor_optimize(optimize: Optional[_bool] = None) -> _bool: ...
287287
def _set_graph_executor_optimize(optimize: _bool): ...
288288
def _export_opnames(module: ScriptModule) -> List[str]: ...
289289
def _create_function_from_trace(

torch/csrc/jit/api/function_impl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ struct TORCH_API GraphFunction : public Function {
9999
using Function::call;
100100
bool call(
101101
Stack& stack,
102-
size_t bailOut,
102+
c10::optional<size_t> bailOut,
103103
c10::function_ref<void(const Code&)> f) override {
104104
f(get_executor().getPlanFor(stack, bailOut).code);
105105
return true;

torch/csrc/jit/passes/specialize_autogradzero.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ struct AutogradZeroSpecializer {
9090
if (!isBackwardGraph()) {
9191
return;
9292
}
93-
if (getProfilingMode()) {
93+
if (getExecutorMode()) {
9494
if (auto versioning_if = guardSpecializations()) {
9595
specializeAutogradOps(versioning_if->blocks()[0]);
9696
GRAPH_DUMP("After versioning graph", graph_);

torch/csrc/jit/python/script_init.cpp

+10-1
Original file line numberDiff line numberDiff line change
@@ -2009,7 +2009,16 @@ void initJitScriptBindings(PyObject* module) {
20092009
setGraphExecutorOptimize(optimize);
20102010
});
20112011

2012-
m.def("_get_graph_executor_optimize", &torch::jit::getGraphExecutorOptimize);
2012+
m.def(
2013+
"_get_graph_executor_optimize",
2014+
[](c10::optional<bool> new_setting = c10::nullopt) {
2015+
bool old_value = getGraphExecutorOptimize();
2016+
if (new_setting) {
2017+
setGraphExecutorOptimize(*new_setting);
2018+
}
2019+
return old_value;
2020+
},
2021+
py::arg("new_settings") = nullptr);
20132022

20142023
m.def(
20152024
"_enable_mobile_interface_call_export",

torch/csrc/jit/runtime/graph_executor.cpp

+15-23
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
#include <torch/csrc/autograd/edge.h>
4444
#include <torch/csrc/autograd/function.h>
45+
#include <torch/csrc/jit/python/update_graph_executor_opt.h>
4546
#include <torch/csrc/jit/runtime/logging.h>
4647

4748
#include <cstdint>
@@ -56,17 +57,16 @@ namespace torch {
5657
namespace jit {
5758

5859
EnableProfilingGuard::EnableProfilingGuard() {
59-
auto& profiling_mode = getProfilingMode();
60-
old_profiling_mode = profiling_mode;
61-
profiling_mode = true;
6260
auto& executor_mode = getExecutorMode();
6361
old_executor_mode = executor_mode;
6462
executor_mode = true;
63+
old_get_optimize = getGraphExecutorOptimize();
64+
setGraphExecutorOptimize(true);
6565
}
6666

6767
EnableProfilingGuard::~EnableProfilingGuard() {
68-
getProfilingMode() = old_profiling_mode;
6968
getExecutorMode() = old_executor_mode;
69+
setGraphExecutorOptimize(old_get_optimize);
7070
}
7171

7272
namespace {
@@ -408,8 +408,7 @@ struct DifferentiableGraphOp {
408408

409409
detachVariables(stack);
410410
if (IsNewExecutorEnabled()) {
411-
const ExecutionPlan& plan =
412-
f_ptr->getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts());
411+
const ExecutionPlan& plan = f_ptr->getPlanFor(stack);
413412
InterpreterState(plan.code).run(stack);
414413
} else {
415414
InterpreterState(legacy_f).run(stack);
@@ -550,8 +549,7 @@ void GraphExecutorImplBase::run(Stack& stack) {
550549
logging::getLogger()->addStatValue(
551550
logging::runtime_counters::GRAPH_EXECUTOR_INVOCATIONS, 1.0);
552551

553-
const ExecutionPlan& plan =
554-
getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts());
552+
const ExecutionPlan& plan = getPlanFor(stack);
555553
InterpreterState(plan.code).run(stack);
556554
last_executed_optimized_graph = plan.graph;
557555
}
@@ -576,9 +574,8 @@ c10::intrusive_ptr<Future> GraphExecutorImplBase::runAsync(
576574
ExecutionPlan plan;
577575
InterpreterState state;
578576
};
579-
auto frame = std::make_shared<Frame>(
580-
getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts()),
581-
std::move(taskLauncher));
577+
auto frame =
578+
std::make_shared<Frame>(getPlanFor(stack), std::move(taskLauncher));
582579
auto res = frame->state.runAsync(stack);
583580
last_executed_optimized_graph = frame->plan.graph;
584581
if (!res->completed()) {
@@ -603,8 +600,9 @@ struct GraphExecutorImpl : public GraphExecutorImplBase {
603600
logging::runtime_counters::GRAPH_EXECUTORS_CONSTRUCTED, 1.0);
604601
}
605602

606-
const ExecutionPlan& getPlanFor(Stack& stack, size_t remaining_bailout_depth)
607-
override {
603+
const ExecutionPlan& getPlanFor(
604+
Stack& stack,
605+
c10::optional<size_t> remaining_bailout_depth) override {
608606
return getGraphExecutorOptimize() ? getOrCompile(stack)
609607
: getOrCompileFallback();
610608
}
@@ -783,13 +781,9 @@ c10::intrusive_ptr<Future> GraphExecutor::runAsync(
783781
return pImpl->runAsync(stack, std::move(taskLauncher));
784782
}
785783

786-
size_t GraphExecutor::getDefaultNumBailOuts() {
787-
return getProfilingMode() ? getBailoutDepth() : 0;
788-
}
789-
790784
const ExecutionPlan& GraphExecutor::getPlanFor(
791785
Stack& inputs,
792-
size_t remaining_bailout_depth) {
786+
c10::optional<size_t> remaining_bailout_depth) {
793787
return pImpl->getPlanFor(inputs, remaining_bailout_depth);
794788
}
795789

@@ -887,10 +881,8 @@ void runNondiffOptimization(
887881

888882
// decomposition pass, decompose certain ops that will be used in the
889883
// following passes (like batchmm and jit fusion)
890-
if (!getProfilingMode()) {
891-
DecomposeOps(graph);
892-
GRAPH_DEBUG("After DecomposeOps\n", *graph);
893-
}
884+
DecomposeOps(graph);
885+
GRAPH_DEBUG("After DecomposeOps\n", *graph);
894886

895887
// TupleConstruct / TupleUnpack pairs can still be present at this point
896888
// and must be removed for fusion.
@@ -901,7 +893,7 @@ void runNondiffOptimization(
901893
BatchMM(graph);
902894

903895
GRAPH_DEBUG("After BatchMM, before Fusion\n", *graph);
904-
if (getProfilingMode()) {
896+
if (getExecutorMode()) {
905897
if (tensorExprFuserEnabled()) {
906898
auto min_size = getFusionGroupInlining() ? 2 : 1;
907899
auto dyn_shapes = tensorExprDynamicShapeFusionEnabled();

torch/csrc/jit/runtime/graph_executor.h

+8-12
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,8 @@ struct Code;
1818

1919
struct ExecutionPlan {
2020
ExecutionPlan() = default;
21-
ExecutionPlan(
22-
std::shared_ptr<Graph> graph,
23-
std::string function_name,
24-
size_t remaining_bailout_depth = 0)
25-
: code(graph, std::move(function_name), remaining_bailout_depth),
26-
graph(std::move(graph)) {}
21+
ExecutionPlan(std::shared_ptr<Graph> graph, std::string function_name)
22+
: code(graph, std::move(function_name)), graph(std::move(graph)) {}
2723

2824
operator bool() const {
2925
return static_cast<bool>(graph);
@@ -34,8 +30,8 @@ struct ExecutionPlan {
3430
};
3531

3632
// Notice that those structs don't manage lifetime of their members.
37-
// They is only valid only right after you call getDebugState() and should never
38-
// be used again once another GraphExecutor function is called.
33+
// They are only valid only right after you call getDebugState() and should
34+
// never be used again once another GraphExecutor function is called.
3935

4036
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
4137
struct GraphExecutorState {
@@ -50,7 +46,7 @@ struct TORCH_API EnableProfilingGuard {
5046

5147
private:
5248
bool old_executor_mode = false;
53-
bool old_profiling_mode = false;
49+
bool old_get_optimize = false;
5450
};
5551

5652
struct GraphExecutorImplBase;
@@ -72,13 +68,13 @@ struct TORCH_API GraphExecutor {
7268
// profiled information whenever a bailout check is failed/triggered, a new
7369
// `GraphExecutor` will be created. This new `GraphExecutor`'s
7470
// remaining_bailout_depth will be reduced by 1.
71+
// If no bailout depth is passed, the depth will be initialized from the
72+
// current global fusion strategy settings.
7573
const ExecutionPlan& getPlanFor(
7674
Stack& inputs,
77-
size_t remaining_bailout_depth);
75+
c10::optional<size_t> remaining_bailout_depth = c10::nullopt);
7876
GraphExecutorState getDebugState();
7977

80-
static size_t getDefaultNumBailOuts();
81-
8278
void debugFlushCompilationCache();
8379

8480
bool isOptimized() const;

torch/csrc/jit/runtime/graph_executor_impl.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ struct GraphExecutorImplBase {
7979

8080
virtual const ExecutionPlan& getPlanFor(
8181
Stack& stack,
82-
size_t remaining_bailout_depth) = 0;
82+
c10::optional<size_t> remaining_bailout_depth = c10::nullopt) = 0;
8383
virtual GraphExecutorState getDebugState() = 0;
8484
virtual ~GraphExecutorImplBase() = default;
8585

torch/csrc/jit/runtime/interpreter.cpp

+2-5
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
175175
void callFunction(
176176
Function& f,
177177
Stack& stack,
178-
size_t bailOut = GraphExecutor::getDefaultNumBailOuts(),
178+
c10::optional<size_t> bailOut = c10::nullopt,
179179
bool next = true) {
180180
bool newFrame = f.call(stack, bailOut, [&](const Code& code) {
181181
enterFrame(code, stack.size() - code.num_inputs());
@@ -716,10 +716,7 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
716716
auto& forked_fn =
717717
toGraphFunction(*frame.function->function_table_[inst.X]);
718718
InterpreterState forked_interpreter(
719-
forked_fn.get_executor()
720-
.getPlanFor(stack, GraphExecutor::getDefaultNumBailOuts())
721-
.code,
722-
taskLauncher_);
719+
forked_fn.get_executor().getPlanFor(stack).code, taskLauncher_);
723720
InterpreterContinuation continuation(
724721
forked_interpreter,
725722
Stack(stack.end() - inst.N, stack.end()),

0 commit comments

Comments
 (0)