diff --git a/apps/cpp_clml/scripts/clml_codegen_json.py b/apps/cpp_clml/scripts/clml_codegen_json.py new file mode 100644 index 000000000000..c3fbf835d8ee --- /dev/null +++ b/apps/cpp_clml/scripts/clml_codegen_json.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import sys +import json +import numpy as np + +import tvm +from tvm import relay +from tvm.driver import tvmc +from tvm.relay.op.contrib import clml +from tvm.contrib import utils +from string import Template + + +def main(): + print("CLML Codegen From JSON") + if len(sys.argv) != 3: + print("Usage: python clml_codegen_json.py ") + return + + with open(sys.argv[1], "r") as file: + codegen = json.load(file) + (_, gen_src) = clml.CLMLGenSrc(codegen).get_artifacts() + + f_src = open(sys.argv[2], "w") + f_src.write("\n".join(gen_src)) + f_src.close() + os.popen("clang-format-15 -i " + sys.argv[2]) + + +if __name__ == "__main__": + main() diff --git a/apps/cpp_clml/scripts/compare_npy.py b/apps/cpp_clml/scripts/compare_npy.py new file mode 100644 index 000000000000..8e3c3a8b630f --- /dev/null +++ b/apps/cpp_clml/scripts/compare_npy.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+import sys
+import numpy as np
+
+
+def main():
+    print("Compare given numpy array in npz files")
+    if len(sys.argv) != 4:
+        print("Usage: python compare_npy.py ")
+        return
+
+    in1 = np.load(sys.argv[1])
+    in2 = np.load(sys.argv[2])
+
+    print(sys.argv[1] + "->" + sys.argv[3] + ":", in1[sys.argv[3]].shape)
+    print(sys.argv[2] + "->" + sys.argv[3] + ":", in2[sys.argv[3]].shape)
+
+    np.testing.assert_allclose(in1[sys.argv[3]], in2[sys.argv[3]], rtol=1e-5, atol=1e-5)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index 785959ce8dd7..b0bd46c123b7 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -17,6 +17,7 @@
 """Graph debug runtime executes TVM debug packed functions."""
 
 import logging
+import json
 import os
 import shutil
 import struct
@@ -117,6 +118,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
         self._run_individual_node = module["run_individual_node"]
         self._debug_get_output = module["debug_get_output"]
         self._execute_node = module["execute_node"]
+        self._debug_run_ext_compiler = module["debug_run_ext_compiler"]
         self._get_node_output = module["get_node_output"]
         self._profile = module["profile"]
         self._profile_rpc = module["profile_rpc"]
@@ -223,6 +225,14 @@ def _run_per_layer(self):
                 output_tensors.append(self._get_node_output(i, j))
         self.debug_datum.update_output_tensors(output_tensors)
 
+    def _run_external_debug(self):
+        ext_trace = self._debug_run_ext_compiler()
+        ext_json = json.loads(ext_trace)
+        for op in ext_json:
+            ext_debug = tvm.get_global_func("runtime.ext.debug." + op["compiler"], True)
+            if isinstance(ext_debug, tvm.runtime.packed_func.PackedFunc):
+                ext_debug(op["op"], op["dump"], self._dump_path)
+
     def _run_debug(
         self,
         number,
@@ -249,6 +259,9 @@ def _run_debug(
         # Get outputs.
         self._run_per_layer()
 
+        # Run external compiler debug if supported
+        self._run_external_debug()
+
     def debug_get_output(self, node, out=None):
         """Run graph up to node and get the output to out
 
diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
index 45caf41e7e58..946afd8a0be3 100644
--- a/python/tvm/contrib/debugger/debug_result.py
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -150,6 +150,10 @@ def dump_output_tensor(self):
         self._cleanup_tensors()
         output_tensors = self.get_output_tensors()
 
+        np_tensors = {}
+        for key, val in output_tensors.items():
+            np_tensors[key] = val.asnumpy()
+        np.savez(os.path.join(self._dump_path, "output_tensors.npz"), **np_tensors)
         with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f:
             param_f.write(save_tensors(output_tensors))
 
diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py
index 1394936b0a57..4c47a56147b6 100644
--- a/python/tvm/driver/tvmc/runner.py
+++ b/python/tvm/driver/tvmc/runner.py
@@ -91,6 +91,12 @@ def add_run_parser(subparsers, main_parser, json_params):  # pylint: disable=unu
         "Profiling may also have an impact on inference time, "
         "making it take longer to be generated.",
     )
+    parser.add_argument(
+        "--profile-options",
+        default="table,sort,aggregate,col_sums",
+        help="Additional options for profiling. Table dump is the default. "
+        "Comma-separated string of table,csv,json,sort,aggregate,col_sums.",
+    )
     parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.")
     parser.add_argument(
         "--end-to-end",
@@ -170,6 +176,7 @@ def drive_run(args):
         repeat=args.repeat,
         number=args.number,
         profile=args.profile,
+        profile_options=args.profile_options,
         end_to_end=args.end_to_end,
     )
 
@@ -359,6 +366,7 @@ def run_module(
     repeat: int = 10,
     number: int = 10,
     profile: bool = False,
+    profile_options: str = "table,sort,aggregate,col_sums",
     end_to_end: bool = False,
 ):
     """Run a compiled graph executor module locally or remotely with
@@ -398,6 +406,8 @@ def run_module(
         Requires `benchmark` to be set to True.
     profile : bool
         Whether to profile the run with the debug executor.
+    profile_options : str
+        Comma-separated profiling options: table,csv,json,sort,aggregate,col_sums.
     end_to_end : bool
         Whether to measure the time of memory copies as well as model
         execution. Turning this on can provide a more realistic estimate
@@ -533,7 +543,15 @@ def run_module(
         logger.info("Running the module with profiling enabled.")
         report = module.profile()
         # This print is intentional
-        print(report)
+        if profile_options.find("table") != -1:
+            is_sort = profile_options.find("sort") != -1
+            is_aggr = profile_options.find("aggregate") != -1
+            is_sum = profile_options.find("col_sums") != -1
+            print(report.table(sort=is_sort, aggregate=is_aggr, col_sums=is_sum))
+        if profile_options.find("csv") != -1:
+            print(report.csv())
+        if profile_options.find("json") != -1:
+            print(report.json())
 
     if not benchmark or device == "micro":
         # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index dace7aaab913..6ee303891cd3 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument, pointless-exception-statement.
"""CLML Library supported operators.""" import json +import os from string import Template import numpy as np import tvm @@ -29,6 +30,7 @@ from tvm.relay import function as _function from tvm.relay.expr_functor import ExprMutator from tvm.relay.expr import Call, TupleGetItem, Var, Constant +from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple from .register import register_pattern_table @@ -159,6 +161,13 @@ def partition_for_clml(mod, params=None, **opts): if params: mod["main"] = bind_params_by_name(mod["main"], params) + pass_context = tvm.get_global_func("transform.GetCurrentPassContext")() + target_version = ( + pass_context.config["relay.ext.clml.target_version"] + if "relay.ext.clml.target_version" in pass_context.config + else 3 + ) + seq = tvm.transform.Sequential( [ transform.InferType(), @@ -631,18 +640,35 @@ def __exit__(self, ptype, value, trace): self.op.set_attr(self.attr_key, self.older_attr) +@register_func("runtime.ext.debug.clml") +def process_debug(op, dump, dump_path): + """Dump the required debug information in given path""" + dump_json = json.loads(dump) + + graph_json = json.loads(dump_json["graph"]) + with open(os.path.join(dump_path, op + ".json"), "w") as outfile: + json.dump(graph_json, outfile, indent=4, sort_keys=False) + + hex_tensors = dump_json["tensors"] + fload = tvm._ffi.get_global_func("runtime.LoadParams") + tensor_map = fload(bytearray.fromhex(hex_tensors)) + np_tensors = {} + for key, val in tensor_map.items(): + np_tensors[key] = val.asnumpy() + np.savez(os.path.join(dump_path, op + ".npz"), **np_tensors) + + class CLMLGetSubModuleSrc: """Generates CLML API one CLML sub module out ot global TVM module""" - def __init__(self, cmod): + def __init__(self, codegen): """Initialize Parameters ---------- - cmod : Module - The CLML sub module from TVM module + codegen : JSON + The CLML sub module as JSON """ - self.cmod = cmod - self.codegen = None + self.codegen = codegen self.nodes = None self.node_map = {} self.input_meta = [] @@ -833,7 +859,6 @@ def __init__(self, cmod): def get_src(self): """Returns pair of sub module name and the generated source""" - self.codegen = json.loads(self.cmod.get_source("json")) self.sub_module_name = self.codegen["symbol"] self.nodes = self.codegen["nodes"] self.clml_code.append(self.MakeHeader.substitute(module=self.sub_module_name)) @@ -848,7 +873,7 @@ def get_tensor_from_map( dtype = str(node["attrs"]["dtype"][0][0]) if node["op"] == "input": self.clml_code.append("// Input Node") - node_out_name = self.sub_module_name + "_" + "input_" + str(node_seq) + node_out_name = node["name"] else: node_out_name = node["name"] if shape is None: @@ -1267,6 +1292,53 @@ def make_output_tensor( return (self.sub_module_name, self.clml_code) +HEADER_STR = """ + /* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + /*! + * \\file clml_models.cc + * \\brief CLML models for all subgraph in given TVM module. + */ + + // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE! + // ========================================================================= + + #include + #include + + #include + #include + #include + #include + #include + + // Project includes + #include "CL/cl.h" + #include "CL/cl_qcom_ml_ops.h" + + #include "clml_runner.h" + + using namespace tvm::runtime; +""" + + class CLMLGenSrc: """Generates CLML API source given a TVM compiled mod""" @@ -1274,8 +1346,7 @@ def __init__(self, libm): """Initialize Parameters ---------- - libm : Module - Compiled relay module + libm : Module or json codegen object """ self.libm = libm self.gen_src = [] @@ -1284,55 +1355,12 @@ def __init__(self, libm): self.codegen = None self.nodes = None - self.MakeFileHeader = Template( - """/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - /*! - * \\file clml_models.cc - * \\brief CLML models for all subgraph in given TVM module. - */ - - // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE! 
- // ========================================================================= - - #include - #include - - #include - #include - #include - #include - #include - - // Project includes - #include "CL/cl.h" - #include "CL/cl_qcom_ml_ops.h" - - #include "clml_runner.h" - - using namespace tvm::runtime; - """ - ) + self.MakeFileHeader = Template(HEADER_STR) def get_clml_params(self): """Returns parameters from the TVM module""" + if not isinstance(self.libm, GraphExecutorFactoryModule): + return {} clml_params = {} if self.libm.get_lib().type_key == "const_loader": @@ -1353,14 +1381,21 @@ def get_clml_params(self): def get_artifacts(self): """Function that returns params as dict and source as list of cource code lines""" - self.clml_modules = list( - filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules) - ) self.clml_builds["file_header"] = [self.MakeFileHeader.substitute()] + if isinstance(self.libm, GraphExecutorFactoryModule): + self.clml_modules = list( + filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules) + ) - for cmod in self.clml_modules: - (sub_module_name, clml_code) = CLMLGetSubModuleSrc(cmod).get_src() + for cmod in self.clml_modules: + codegen = json.loads(cmod.get_source("json")) + (sub_module_name, clml_code) = CLMLGetSubModuleSrc(codegen).get_src() + self.clml_builds[sub_module_name] = clml_code + elif isinstance(self.libm, dict): + (sub_module_name, clml_code) = CLMLGetSubModuleSrc(self.libm).get_src() self.clml_builds[sub_module_name] = clml_code + else: + raise Exception("Don't know how to handle the input") main_code = [] main_code.append( diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index c580123b1347..d8c0075fcdc1 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -23,11 +23,15 @@ */ #include "clml_runtime.h" +#include + #ifdef TVM_GRAPH_EXECUTOR_CLML #include "clml_memory_planner.h" #include "clml_utils.h" #endif +#include + namespace tvm { namespace runtime { namespace contrib { @@ -60,23 +64,28 @@ CLMLWorkspace::CLMLWorkspace() { result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr); ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result; std::string extensions(extn_buf.data()); - LOG(WARNING) << "OpenCL Extensions:" << extensions; + LOG_CLML << "OpenCL Extensions:" << extensions; if (extensions.find("cl_qcom_ml_ops") == std::string::npos) { LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not present.\n"; return; } - is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos); - is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos); - LOG(WARNING) << "Recordable Queues Support :" << is_recordable_queue; - LOG(WARNING) << "On chip Memory Support :" << is_on_chip_memory; + if (getenv("CLML_DISABLE_RECORDABLE_QUEUE")) { + is_recordable_queue = 0; + is_on_chip_memory = 0; + } else { + is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos); + is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos); + LOG_CLML << "Recordable Queues Support :" << is_recordable_queue; + LOG_CLML << "On chip Memory Support :" << is_on_chip_memory; + } if (is_on_chip_memory) { result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM, sizeof(onchip_mem_size), &onchip_mem_size, nullptr); ICHECK(result == CL_SUCCESS) << 
"clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):" << result; - LOG(WARNING) << "On chip memory size:" << onchip_mem_size; + LOG_CLML << "On chip memory size:" << onchip_mem_size; } // Query and Get CLML Interface @@ -106,10 +115,6 @@ CLMLWorkspace::CLMLWorkspace() { target_minor = 0; } - // ICHECK(target_minor <= CL_QCOM_ML_OPS_H_MINOR_VERSION) - // << "CLML runtime compiled with minor version " << CL_QCOM_ML_OPS_H_MINOR_VERSION - // << " where as the target supports higher version " << target_minor; - clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor); ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface, target is not supported." @@ -257,6 +262,167 @@ class CLMLRuntime : public JSONRuntimeBase { } } + std::string DebugDump(void) override { + if (cws->is_recordable_queue) { + LOG(FATAL) << "Debugging over recordable queues is not supported yet. You may disable the " + "same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime."; + } + cl_command_queue queue = CLML_QUEUE; + Map dump_tensors; + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + + writer.WriteObjectKeyValue("graph", graph_json_); + + int op_index = 0; + for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) { + int nid = it->first; + auto clml_desc = it->second.first; + auto node = it->second.second; + + if ("kernel" == node.GetOpType()) { + CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index], + this->layer_.descriptorSet, 0, nullptr, nullptr); + OPENCL_CALL(clFinish(queue)); + op_index++; + } + + // Dump tensor to CPU + std::vector shape = node.GetOpShape()[0]; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + NDArray narr = NDArray::Empty(ShapeTuple(shape), tvm_dtype, {kDLCPU, 0}); + CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data); + + // Naming convention + std::string node_name; + bool is_out = false; + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + is_out = (eid == nid); + } + if (is_out) { + node_name = clml_symbol + "_layer_out_" + std::to_string(nid); + } else if (("const" == node.GetOpType()) || ("input" == node.GetOpType())) { + node_name = node.GetOpName(); + } else { + node_name = node.GetOpName() + "____topo-index:" + std::to_string(nid); + } + dump_tensors.Set(node_name, narr); + } + + const PackedFunc* f = Registry::Get("runtime.SaveParams"); + if (nullptr != f) { + std::string dump_bytes = (*f)(dump_tensors); + std::ostringstream oss; + /*TODO(Siva) HEX encoding doubles the size, look for better encode that can cross the RPC. 
*/ + for (size_t i = 0; i < dump_bytes.size(); ++i) { + oss << std::setw(2) << std::setfill('0') << std::hex << static_cast(dump_bytes[i]); + } + writer.WriteObjectKeyValue("tensors", oss.str()); + } + + writer.EndObject(); + return os.str(); + } + + void RunProfile(profiling::Profiler* prof) override { + cl_command_queue queue = CLML_QUEUE; + std::vector& evts = cws->workspace->GetEventQueue(cws->tentry->device); + std::vector cs; + std::vector devices; + devices.push_back(cws->tentry->device); + + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + uint32_t eid = EntryID(nid, 0); + if (nodes_[nid].GetOpType() == "input") { + // Assuming all inputs are from OpenCL + if (kDLOpenCL == data_entry_[eid]->device.device_type) { + layer_.in_placeholder[nid]->memory = static_cast( + ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); + cl_event cpy_evt = nullptr; + cl_event* evt = &cpy_evt; + if (cws->workspace->IsProfiling(cws->tentry->device)) { + evts.resize(evts.size() + 1); + evt = &(evts.back()); + } + std::unordered_map metrics; + std::string shape_str; + std::vector shape = nodes_[nid].GetOpShape()[0]; + DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + metrics["Argument Shapes"] = String(shape_str); + + prof->StartCall("CopyIn", cws->tentry->device, metrics); + CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor, + layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor, + layer_.inputs[nid]->memory, 0, nullptr, evt); + prof->StopCall(); + } + } + } + + for (size_t i = 0; i < this->layer_.function.size(); ++i) { + std::unordered_map metrics; + auto node = this->layer_.op_node_map[this->layer_.function[i]].second; + std::string shape_str; + for (uint32_t j = 0; j < node.GetInputs().size(); ++j) { + const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_]; + std::vector shape = in_node.GetOpShape()[0]; + DLDataType tvm_dtype = in_node.GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + shape_str.append(", "); + } + // Assuming one output per operation + std::vector shape = node.GetOpShape()[0]; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + metrics["Argument Shapes"] = String(shape_str); + + // Launch call + prof->StartCall(clml_symbol + "-" + this->layer_.layer_names[i], cws->tentry->device, + metrics); + queue = CLML_QUEUE; + evts.resize(evts.size() + 1); + cl_event* evt = &(evts.back()); + CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0, + nullptr, evt); + prof->StopCall(); + } + + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + + // Assuming all outputs are to OpenCL + if (kDLOpenCL == data_entry_[eid]->device.device_type) { + layer_.out_placeholder[i]->memory = static_cast( + ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); + cl_event cpy_evt = nullptr; + cl_event* evt = &cpy_evt; + if (cws->workspace->IsProfiling(cws->tentry->device)) { + evts.resize(evts.size() + 1); + evt = &(evts.back()); + } + + std::unordered_map metrics; + std::string shape_str; + std::vector shape = nodes_[eid].GetOpShape()[0]; + DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + metrics["Argument Shapes"] = String(shape_str); + + prof->StartCall("CopyOut", cws->tentry->device, 
metrics); + CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor, + layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor, + layer_.out_placeholder[i]->memory, 0, nullptr, evt); + prof->StopCall(); + } + } + + return; + } + /*! * \brief Unpack inputs and outputs and run inference on a given layer. * @@ -305,7 +471,7 @@ class CLMLRuntime : public JSONRuntimeBase { int64_t duration = 0; if (cws->is_recordable_queue) { - if (getenv("CLML_PROFILING")) { + if (cws->workspace->IsProfiling(cws->tentry->device)) { Timer t; auto f = Registry::Get(std::string("profiling.timer.opencl")); t = f->operator()(cws->tentry->device); @@ -324,7 +490,7 @@ class CLMLRuntime : public JSONRuntimeBase { } else { for (size_t i = 0; i < this->layer_.function.size(); ++i) { // Make CLML subgraphs accounted by OpenCLTimerNode. - if (getenv("CLML_PROFILING")) { + if (cws->workspace->IsProfiling(cws->tentry->device)) { Timer t; auto f = Registry::Get(std::string("profiling.timer.opencl")); t = f->operator()(cws->tentry->device); @@ -336,16 +502,16 @@ class CLMLRuntime : public JSONRuntimeBase { 0, nullptr, evt); t->Stop(); duration += t->SyncAndGetElapsedNanos(); - LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] - << " Duration:" << t->SyncAndGetElapsedNanos(); + LOG_CLML << "Layer:" << this->layer_.layer_names[i] + << " Duration:" << t->SyncAndGetElapsedNanos(); } else { CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0, nullptr, nullptr); } } } - if (getenv("CLML_PROFILING")) { - LOG(WARNING) << "Total Duration for " << clml_symbol << " is:" << duration; + if (cws->workspace->IsProfiling(cws->tentry->device)) { + LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration; } for (size_t i = 0; i < outputs_.size(); ++i) { @@ -616,6 +782,8 @@ class CLMLRuntime : public JSONRuntimeBase { else LOG(FATAL) << "Unsupported op: " << op_name; this->layer_.layer_names.push_back(op_name); + // Keep map of function and Node to use in profiling + this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)}); } else if (node.GetOpType() != "const") { LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType(); } @@ -710,11 +878,11 @@ class CLMLRuntime : public JSONRuntimeBase { this->layer_.tensorMemDescs.data()); if (cws->is_tuning_run) { - LOG(WARNING) << "CLML Tunning In Progress:"; + LOG_CLML << "CLML Tunning In Progress:"; // Let the command queue recreated in profiling mode. 
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true); for (size_t i = 0; i < this->layer_.function.size(); ++i) { - LOG(WARNING) << "CLML Tunning:" << this->layer_.layer_names[i]; + LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i]; CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet, this->layer_.tuning_cache, nullptr); } @@ -741,8 +909,8 @@ class CLMLRuntime : public JSONRuntimeBase { std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary); ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file; fs.write(&tune_str[0], tune_str.length()); - LOG(WARNING) << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size" - << tune_str.length() << " with tuning blob len " << saved_cache.size(); + LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size" + << tune_str.length() << " with tuning blob len " << saved_cache.size(); } if (cws->is_recordable_queue) { for (size_t i = 0; i < this->layer_.function.size(); ++i) { @@ -1591,6 +1759,8 @@ class CLMLRuntime : public JSONRuntimeBase { << "Please build with USE_CLML_GRAPH_EXECUTOR."; } #endif + bool CanDebug() override { return true; } + /*! CLML sub graph symbol in TVM main module */ std::string clml_symbol; }; diff --git a/src/runtime/contrib/clml/clml_runtime.h b/src/runtime/contrib/clml/clml_runtime.h index f346ce7af696..9dfde2f7820d 100644 --- a/src/runtime/contrib/clml/clml_runtime.h +++ b/src/runtime/contrib/clml/clml_runtime.h @@ -164,8 +164,10 @@ static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45; #define DEBUG_MEMORY_ALLOC false #define DEBUG_STATS false +#define DEBUG_CLML false #define LOG_MEM LOG_IF(WARNING, DEBUG_MEMORY_ALLOC) #define LOG_STATS LOG_IF(WARNING, DEBUG_STATS) +#define LOG_CLML LOG_IF(WARNING, DEBUG_CLML) namespace tvm { namespace runtime { @@ -235,6 +237,8 @@ class CLMLThreadEntry { struct CachedLayer { /* List of all created CLML operation handles in graph */ std::vector function; + /* Map of function and original JsonNode */ + std::map> op_node_map; /* The input tensor map */ std::map> inputs; /* A place holder Tensor representing TVM NDArray as CLML Tensor */ diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index 8eec0447a189..8e105dab7837 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -69,6 +70,25 @@ class JSONRuntimeBase : public ModuleNode { /*! \brief Invoke the execution engine to inteprete a specific json runtime. */ virtual void Run() = 0; + /*! \brief Does the backend support debug & profiling */ + virtual bool CanDebug() { return false; } + + /*! + * \brief Invoke the profiler + * \param pointer to profiler + */ + virtual void RunProfile(profiling::Profiler* prof) { + LOG(FATAL) << "Not expected to be here : Profiling call w/o support ?"; + } + + /*! + * \brief Invoke the debugger + * \return External compiler specific debug blob + */ + virtual std::string DebugDump(void) { + LOG(FATAL) << "Not expected to be here : Debug dump w/o support ?"; + } + /*! * \brief Get a packed function. * \param name The name/symbol of the function. @@ -88,9 +108,32 @@ class JSONRuntimeBase : public ModuleNode { // Bind argument tensors to data entries. this->SetInputOutputBuffers(args); + // Execute the subgraph. 
this->Run(); }); + } else if (this->symbol_name_ + "_debug" == name) { + if (!this->CanDebug()) { + return PackedFunc(nullptr); + } + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + ICHECK(this->initialized_) << "The module has not been initialized"; + + // Bind argument tensors to data entries. + this->SetInputOutputBuffers(args); + + if (rv->IsObjectRef()) { + String purpose = *rv; + if ("debug_dump" == purpose) { + *rv = this->DebugDump(); + } + } else { + // Profile the subgraph. + profiling::Profiler* prof = static_cast(rv->value().v_handle); + this->RunProfile(prof); + } + // String vendor_prof = this->RunProfile(prof); + }); } else if ("__init_" + this->symbol_name_ == name) { // The function to initialize constant tensors. return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 892a13b46bb4..a9cd4d544d3b 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -213,6 +213,9 @@ PackedFunc GraphExecutorDebug::GetFunction(const String& name, } else if (name == "execute_node") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->ExecuteNode(args[0]); }); + } else if (name == "debug_run_ext_compiler") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->DebugRunExtCompiler(); }); } else if (name == "get_node_output") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetNodeOutput(args[0], args[1]); @@ -320,6 +323,31 @@ void GraphExecutorDebug::ExecuteNode(int node) { last_executed_node_ = end_ind; } +std::string GraphExecutorDebug::DebugRunExtCompiler(void) { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginArray(); + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (!nodes_[i].param.compiler.empty() && op_profile_execs_[i]) { + TVMRetValue rv; + rv = String("debug_dump"); + this->op_profile_execs_[i](&rv); + std::string debug_ret = rv; + + writer.BeginObject(); + writer.WriteObjectKeyValue("compiler", nodes_[i].param.compiler); + writer.WriteObjectKeyValue("op", nodes_[i].param.func_name); + writer.WriteObjectKeyValue("dump", debug_ret); + writer.EndObject(); + } else { + if (op_execs_[i]) op_execs_[i](); + } + } + writer.EndArray(); + + return os.str(); +} + void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) { ICHECK_LT(static_cast(index), op_execs_.size()); uint32_t eid = index; @@ -386,9 +414,15 @@ profiling::Report GraphExecutorDebug::Profile(Array metrics["Hash"] = Downcast(nodes_[i].param.attrs.at("hash")); } metrics["Argument Shapes"] = profiling::ShapeString(shapes); - prof.StartCall(nodes_[i].param.func_name, dev, metrics); - op_execs_[i](); - prof.StopCall(); + if (!nodes_[i].param.compiler.empty() && op_profile_execs_[i]) { + TVMRetValue rv; + rv = static_cast(&prof); + this->op_profile_execs_[i](&rv); + } else { + prof.StartCall(nodes_[i].param.func_name, dev, metrics); + op_execs_[i](); + prof.StopCall(); + } } } prof.Stop(); diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.h b/src/runtime/graph_executor/debug/graph_executor_debug.h index 382083056604..8ede2a3a5f84 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.h +++ b/src/runtime/graph_executor/debug/graph_executor_debug.h @@ -98,6 +98,15 @@ class GraphExecutorDebug : public 
GraphExecutor { */ void ExecuteNode(int node); + /*! + * \brief debug external comilers if supported. + * + * This method invokes the external compilers to generate any debug trace info. + * + * \return Returns serialized debug trace information to the caller + */ + std::string DebugRunExtCompiler(void); + /*! * \brief Returns index-th output of node. * diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 6324da9c27ef..1b1051322c49 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -503,6 +503,7 @@ void GraphExecutor::SetupStorage() { void GraphExecutor::SetupOpExecs() { op_execs_.resize(this->GetNumOfNodes()); + op_profile_execs_.resize(this->GetNumOfNodes()); input_dltensors_.resize(num_node_entries()); output_dltensors_.resize(num_node_entries()); both_output_opinput_dltensors_.resize(num_node_entries()); @@ -532,7 +533,7 @@ void GraphExecutor::SetupOpExecs() { ICHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; std::shared_ptr op_args = nullptr; - std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args); + std::tie(op_execs_[nid], op_profile_execs_[nid], op_args) = CreateTVMOp(inode.param, args); for (size_t i = 0; i < inode.inputs.size(); i++) { uint32_t input_eid = this->entry_id(inode.inputs[i]); @@ -581,8 +582,9 @@ void GraphExecutor::SetupOpExecs() { } } -std::pair, std::shared_ptr> GraphExecutor::CreateTVMOp( - const TVMOpParam& param, const std::vector& args) { +std::tuple, std::function, + std::shared_ptr> +GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector& args) { std::shared_ptr arg_ptr = std::make_shared(); // setup address. arg_ptr->args = args; @@ -604,7 +606,7 @@ std::pair, std::shared_ptr> GraphEx } if (param.func_name == "__nop") { - return {[]() {}, arg_ptr}; + return {[]() {}, [](TVMRetValue* rv) {}, arg_ptr}; } else if (param.func_name == "__copy") { // Perform cross device data copy. // Directly copy data from the input to the output. @@ -614,21 +616,31 @@ std::pair, std::shared_ptr> GraphEx DLTensor* to = static_cast(arg_ptr->arg_values[1].v_handle); TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr)); }; - return {fexec, arg_ptr}; + return {fexec, [](TVMRetValue* rv) {}, arg_ptr}; } // Get compiled function from the module that contains both host and device // code. tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, true); ICHECK(pf != nullptr) << "no such function in module: " << param.func_name; - auto fexec = [arg_ptr, pf]() { TVMRetValue rv; TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), static_cast(arg_ptr->arg_values.size())); pf.CallPacked(targs, &rv); }; - return {fexec, arg_ptr}; + + pf = module_.GetFunction(param.func_name + "_debug", true); + std::function fexec_profile = nullptr; + if (pf != nullptr) { + fexec_profile = [arg_ptr, pf](TVMRetValue* rv) { + TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), + static_cast(arg_ptr->arg_values.size())); + pf.CallPacked(targs, rv); + }; + } + + return {fexec, fexec_profile, arg_ptr}; } PackedFunc GraphExecutor::GetFunction(const String& name, const ObjectPtr& sptr_to_self) { diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 53e2801d574e..cfdba8916baa 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -56,6 +56,7 @@ using memory::MemoryManager; /*! 
\brief operator attributes about tvm op */ struct TVMOpParam { std::string func_name; + std::string compiler; std::unordered_map attrs; uint32_t num_inputs; uint32_t num_outputs; @@ -272,6 +273,9 @@ class TVM_DLL GraphExecutor : public ModuleNode { if (key == "func_name") { param->func_name = value; bitmask |= 1; + } + if (key == "Compiler") { + param->compiler = value; } else if (key == "num_inputs") { param->num_inputs = strtoul(value.c_str(), nullptr, 10); bitmask |= 2; @@ -440,8 +444,8 @@ class TVM_DLL GraphExecutor : public ModuleNode { * \param args The arguments to the functor, including inputs and outputs. * \return The created executor. */ - std::pair, std::shared_ptr> CreateTVMOp( - const TVMOpParam& attrs, const std::vector& args); + std::tuple, std::function, std::shared_ptr> + CreateTVMOp(const TVMOpParam& attrs, const std::vector& args); // Get node entry index. uint32_t entry_id(uint32_t nid, uint32_t index) const { return node_row_ptr_[nid] + index; } // Get node entry index. @@ -486,6 +490,8 @@ class TVM_DLL GraphExecutor : public ModuleNode { std::vector data_alignment_; /*! \brief Operator on each node. */ std::vector> op_execs_; + /*! \brief Profilable Operator on each node. */ + std::vector> op_profile_execs_; /*! \brief Linked parameter lookup function. */ PackedFunc lookup_linked_param_; /*! \brief Module's _lookup_linked_param function, used by DefaultLookupLinkedParam. */ diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index e0abd1841b64..2e9b05edcb58 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -514,7 +514,7 @@ class OpenCLTimerNode : public TimerNode { cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear(); // Very first call of Start() leads to the recreation of // OpenCL command queue in profiling mode. This allows to run profile after inference. 
- recreateCommandQueue(); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, true); } ++count_timer_execs; // set new first idx in event queue @@ -549,7 +549,7 @@ class OpenCLTimerNode : public TimerNode { // Profiling session ends, recreate clCommandQueue in non-profiling mode // This will disable collection of cl_events in case of executing inference after profile if (count_timer_execs == 0) { - recreateCommandQueue(); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, false); event_start_idxs.clear(); } } @@ -565,11 +565,6 @@ class OpenCLTimerNode : public TimerNode { private: int64_t duration; Device dev_; - - void recreateCommandQueue() { - cl::OpenCLWorkspace::Global()->EnableQueueProfiling( - dev_, !cl::OpenCLWorkspace::Global()->IsProfiling(dev_)); - } }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 6a42d840b206..83be98556a9e 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -671,7 +671,7 @@ Report Profiler::Report() { rows.push_back(row); } - // the last couple of call frames are the overall times + // the last frames are the overall times double overall_time_us = 0; std::unordered_map> device_metrics; for (size_t i = 0; i < devs_.size(); i++) { @@ -776,7 +776,6 @@ Report Report::FromJSON(String json) { configuration = parse_metrics(&reader); } } - return Report(calls, device_metrics, configuration); } diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index b0c3559bf081..a35a633e1dfd 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -112,7 +112,7 @@ case ${ENVIRONMENT} in adb forward tcp:$((LISTEN_PORT + 1)) tcp:$((LISTEN_PORT + 1)) adb forward tcp:$((LISTEN_PORT + 2)) tcp:$((LISTEN_PORT + 2)) adb forward tcp:$((LISTEN_PORT + 3)) tcp:$((LISTEN_PORT + 3)) - adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_PROFILING=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" + adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" ;; "query")
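
Usage note (outside the patch): the intended flow is that the debug executor collects per-layer outputs plus a per-CLML-subgraph dump through the "runtime.ext.debug.clml" hook, and the new scripts consume those dumps offline. A minimal sketch, assuming an already compiled OpenCL+CLML artifact; the file names, dump directory, and tensor names below are illustrative and not defined by this patch:

    # Sketch only: "model_clml.so" / "model_clml.json" are assumed prebuilt artifacts.
    import tvm
    from tvm.contrib.debugger import debug_executor

    dev = tvm.cl(0)
    lib = tvm.runtime.load_module("model_clml.so")
    graph_json = open("model_clml.json").read()

    # dump_root receives output_tensors.npz (from debug_result.py) and the
    # <clml_subgraph>.json / <clml_subgraph>.npz files written by the
    # runtime.ext.debug.clml handler registered in clml.py.
    m = debug_executor.create(graph_json, lib, dev, dump_root="/tmp/tvmdbg")
    m.run()

    # Offline inspection with the new scripts (paths illustrative):
    #   python compare_npy.py reference.npz <clml_subgraph>.npz <tensor_name>
    #   python clml_codegen_json.py <clml_subgraph>.json clml_models.cc
    # Profiling output selection via tvmc (illustrative):
    #   tvmc run --profile --profile-options table,sort,csv compiled_module.tar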