Skip to content

Commit

Permalink
[RUNTIME][CLML] Profiling options enabled for CLML (BYOC via JSON Run…
Browse files Browse the repository at this point in the history
…time)

Graph debug runtime modifications to accommodate profiling through
BYOC external calls. Updated the TVMC interface to support more output
formats for the profile dump. Added CLML helpers that can rebuild CPP
CLML sources from profile dumps. CLML runtime profiling is now
controlled by the runtime profile flag.
  • Loading branch information
srkreddy1238 committed Jan 28, 2025
1 parent 050b23f commit 15e86e2
Show file tree
Hide file tree
Showing 16 changed files with 537 additions and 103 deletions.
51 changes: 51 additions & 0 deletions apps/cpp_clml/scripts/clml_codegen_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import os
import sys
import json
import numpy as np

import tvm
from tvm import relay
from tvm.driver import tvmc
from tvm.relay.op.contrib import clml
from tvm.contrib import utils
from string import Template


def main():
    """Generate CLML CPP source from a CLML codegen JSON dump.

    Usage: python clml_codegen_json.py <json path> <outfile path>

    Reads the codegen JSON produced by a CLML profile dump, rebuilds the
    CPP source via clml.CLMLGenSrc, writes it to the output path, and
    formats the result with clang-format-15.
    """
    import subprocess

    print("CLML Codegen From JSON")
    if len(sys.argv) != 3:
        print("Usage: python clml_codegen_json.py <json path> <outfile path>")
        return

    with open(sys.argv[1], "r") as file:
        codegen = json.load(file)
    (_, gen_src) = clml.CLMLGenSrc(codegen).get_artifacts()

    # Context manager guarantees the file is flushed and closed before
    # clang-format runs on it (the original left f_src unclosed).
    with open(sys.argv[2], "w") as f_src:
        f_src.write("\n".join(gen_src))

    # subprocess.run waits for the formatter to finish; os.popen returned
    # immediately, so the script could exit before formatting completed.
    # check=False keeps the original best-effort behavior if the tool is absent.
    subprocess.run(["clang-format-15", "-i", sys.argv[2]], check=False)


# Entry point when executed as a standalone script.
if __name__ == "__main__":
    main()
41 changes: 41 additions & 0 deletions apps/cpp_clml/scripts/compare_npy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import sys
import numpy as np


def main():
    """Compare one named array across two .npz archives.

    Usage: python compare_npy.py <npz file 1> <npz file 2> <array name>

    Loads both archives, prints the shape of the named array from each
    file, and raises AssertionError if the values differ beyond
    rtol=1e-5 / atol=1e-5.
    """
    print("Compare given numpy array in npz files")
    if len(sys.argv) != 4:
        print("Usage: python compare_npy.py <npz file 1> <npz file 2> <np array to compare>")
        return

    in1 = np.load(sys.argv[1])
    in2 = np.load(sys.argv[2])

    print(sys.argv[1] + "->" + sys.argv[3] + ":", in1[sys.argv[3]].shape)
    # Bug fix: the original printed in1's shape on both lines; this line
    # must report the shape found in the second archive.
    print(sys.argv[2] + "->" + sys.argv[3] + ":", in2[sys.argv[3]].shape)

    np.testing.assert_allclose(in1[sys.argv[3]], in2[sys.argv[3]], rtol=1e-5, atol=1e-5)


# Entry point when executed as a standalone script.
if __name__ == "__main__":
    main()
13 changes: 13 additions & 0 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""Graph debug runtime executes TVM debug packed functions."""

import logging
import json
import os
import shutil
import struct
Expand Down Expand Up @@ -117,6 +118,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
self._run_individual_node = module["run_individual_node"]
self._debug_get_output = module["debug_get_output"]
self._execute_node = module["execute_node"]
self._debug_run_ext_compiler = module["debug_run_ext_compiler"]
self._get_node_output = module["get_node_output"]
self._profile = module["profile"]
self._profile_rpc = module["profile_rpc"]
Expand Down Expand Up @@ -223,6 +225,14 @@ def _run_per_layer(self):
output_tensors.append(self._get_node_output(i, j))
self.debug_datum.update_output_tensors(output_tensors)

def _run_external_debug(self):
    """Dispatch debug dumps from external (BYOC) compilers to their hooks.

    Queries the module for an external-compiler trace (a JSON list of
    per-op records), then for each record looks up a global function
    named ``runtime.ext.debug.<compiler>``. When such a hook is
    registered, it is invoked with the op name, the dump payload, and
    this executor's dump path; records without a registered hook are
    silently skipped.
    """
    trace = json.loads(self._debug_run_ext_compiler())
    for record in trace:
        hook_name = "runtime.ext.debug." + record["compiler"]
        hook = tvm.get_global_func(hook_name, True)
        if isinstance(hook, tvm.runtime.packed_func.PackedFunc):
            hook(record["op"], record["dump"], self._dump_path)

def _run_debug(
self,
number,
Expand All @@ -249,6 +259,9 @@ def _run_debug(
# Get outputs.
self._run_per_layer()

# Run external compiler debug if supported
self._run_external_debug()

def debug_get_output(self, node, out=None):
"""Run graph up to node and get the output to out
Expand Down
4 changes: 4 additions & 0 deletions python/tvm/contrib/debugger/debug_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,10 @@ def dump_output_tensor(self):
self._cleanup_tensors()
output_tensors = self.get_output_tensors()

np_tensors = {}
for key, val in output_tensors.items():
np_tensors[key] = val.asnumpy()
np.savez(os.path.join(self._dump_path, "output_tensors.npz"), **np_tensors)
with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f:
param_f.write(save_tensors(output_tensors))

Expand Down
20 changes: 19 additions & 1 deletion python/tvm/driver/tvmc/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,12 @@ def add_run_parser(subparsers, main_parser, json_params): # pylint: disable=unu
"Profiling may also have an impact on inference time, "
"making it take longer to be generated.",
)
parser.add_argument(
"--profile-options",
default="table,sort,aggregate,col_sums",
help="Additional options for profiling. Table dump is default"
"comma seperated string of table,csv,json,sort,aggregate,col_sums",
)
parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.")
parser.add_argument(
"--end-to-end",
Expand Down Expand Up @@ -170,6 +176,7 @@ def drive_run(args):
repeat=args.repeat,
number=args.number,
profile=args.profile,
profile_options=args.profile_options,
end_to_end=args.end_to_end,
)

Expand Down Expand Up @@ -359,6 +366,7 @@ def run_module(
repeat: int = 10,
number: int = 10,
profile: bool = False,
profile_options: str = "table,sort,aggregate,col_sums",
end_to_end: bool = False,
):
"""Run a compiled graph executor module locally or remotely with
Expand Down Expand Up @@ -398,6 +406,8 @@ def run_module(
Requires `benchmark` to be set to True.
profile : bool
Whether to profile the run with the debug executor.
profile_options : string
Additional options for profiling
end_to_end : bool
Whether to measure the time of memory copies as well as model
execution. Turning this on can provide a more realistic estimate
Expand Down Expand Up @@ -533,7 +543,15 @@ def run_module(
logger.info("Running the module with profiling enabled.")
report = module.profile()
# This print is intentional
print(report)
if profile_options.find("table") != -1:
is_sort = profile_options.find("sort") != -1
is_aggr = profile_options.find("aggregate") != -1
is_sum = profile_options.find("col_sums") != -1
print(report.table(sort=is_sort, aggregate=is_aggr, col_sums=is_sum))
if profile_options.find("csv") != -1:
print(report.csv())
if profile_options.find("json") != -1:
print(report.json())

if not benchmark or device == "micro":
# TODO(gromero): Fix time_evaluator() for micro targets. Once it's
Expand Down
Loading

0 comments on commit 15e86e2

Please sign in to comment.