diff --git a/apps/cpp_clml/scripts/clml_codegen_json.py b/apps/cpp_clml/scripts/clml_codegen_json.py new file mode 100644 index 000000000000..c3fbf835d8ee --- /dev/null +++ b/apps/cpp_clml/scripts/clml_codegen_json.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import sys +import json +import numpy as np + +import tvm +from tvm import relay +from tvm.driver import tvmc +from tvm.relay.op.contrib import clml +from tvm.contrib import utils +from string import Template + + +def main(): + print("CLML Codegen From JSON") + if len(sys.argv) != 3: + print("Usage: python clml_codegen_json.py ") + return + + with open(sys.argv[1], "r") as file: + codegen = json.load(file) + (_, gen_src) = clml.CLMLGenSrc(codegen).get_artifacts() + + f_src = open(sys.argv[2], "w") + f_src.write("\n".join(gen_src)) + f_src.close() + os.popen("clang-format-15 -i " + sys.argv[2]) + + +if __name__ == "__main__": + main() diff --git a/apps/cpp_clml/scripts/compare_npy.py b/apps/cpp_clml/scripts/compare_npy.py new file mode 100644 index 000000000000..8e3c3a8b630f --- /dev/null +++ b/apps/cpp_clml/scripts/compare_npy.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+
+import sys
+import numpy as np
+
+
+def main():
+    print("Compare given numpy array in npz files")
+    if len(sys.argv) != 4:
+        print("Usage: python compare_npy.py ")
+        return
+
+    in1 = np.load(sys.argv[1])
+    in2 = np.load(sys.argv[2])
+
+    print(sys.argv[1] + "->" + sys.argv[3] + ":", in1[sys.argv[3]].shape)
+    print(sys.argv[2] + "->" + sys.argv[3] + ":", in2[sys.argv[3]].shape)
+
+    np.testing.assert_allclose(in1[sys.argv[3]], in2[sys.argv[3]], rtol=1e-5, atol=1e-5)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index 785959ce8dd7..b0bd46c123b7 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -17,6 +17,7 @@
 """Graph debug runtime executes TVM debug packed functions."""
 
 import logging
+import json
 import os
 import shutil
 import struct
@@ -117,6 +118,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
         self._run_individual_node = module["run_individual_node"]
         self._debug_get_output = module["debug_get_output"]
         self._execute_node = module["execute_node"]
+        self._debug_run_ext_compiler = module["debug_run_ext_compiler"]
         self._get_node_output = module["get_node_output"]
         self._profile = module["profile"]
         self._profile_rpc = module["profile_rpc"]
@@ -223,6 +225,14 @@ def _run_per_layer(self):
                 output_tensors.append(self._get_node_output(i, j))
         self.debug_datum.update_output_tensors(output_tensors)
 
+    def _run_external_debug(self):
+        ext_trace = self._debug_run_ext_compiler()
+        ext_json = json.loads(ext_trace)
+        for op in ext_json:
+            ext_debug = tvm.get_global_func("runtime.ext.debug." + op["compiler"], True)
+            if isinstance(ext_debug, tvm.runtime.packed_func.PackedFunc):
+                ext_debug(op["op"], op["dump"], self._dump_path)
+
     def _run_debug(
         self,
         number,
@@ -249,6 +259,9 @@ def _run_debug(
         # Get outputs.
         self._run_per_layer()
 
+        # Run external compiler debug if supported
+        self._run_external_debug()
+
     def debug_get_output(self, node, out=None):
         """Run graph up to node and get the output to out
 
diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py
index 45caf41e7e58..946afd8a0be3 100644
--- a/python/tvm/contrib/debugger/debug_result.py
+++ b/python/tvm/contrib/debugger/debug_result.py
@@ -150,6 +150,10 @@ def dump_output_tensor(self):
         self._cleanup_tensors()
         output_tensors = self.get_output_tensors()
 
+        np_tensors = {}
+        for key, val in output_tensors.items():
+            np_tensors[key] = val.asnumpy()
+        np.savez(os.path.join(self._dump_path, "output_tensors.npz"), **np_tensors)
         with open(os.path.join(self._dump_path, "output_tensors.params"), "wb") as param_f:
             param_f.write(save_tensors(output_tensors))
 
diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py
index 1394936b0a57..4c47a56147b6 100644
--- a/python/tvm/driver/tvmc/runner.py
+++ b/python/tvm/driver/tvmc/runner.py
@@ -91,6 +91,12 @@ def add_run_parser(subparsers, main_parser, json_params):  # pylint: disable=unu
         "Profiling may also have an impact on inference time, "
         "making it take longer to be generated.",
     )
+    parser.add_argument(
+        "--profile-options",
+        default="table,sort,aggregate,col_sums",
+        help="Additional options for profiling. Table dump is the default. "
+        "Comma-separated string of table,csv,json,sort,aggregate,col_sums.",
+    )
     parser.add_argument("-v", "--verbose", action="count", default=0, help="increase verbosity.")
     parser.add_argument(
         "--end-to-end",
@@ -170,6 +176,7 @@ def drive_run(args):
         repeat=args.repeat,
         number=args.number,
         profile=args.profile,
+        profile_options=args.profile_options,
         end_to_end=args.end_to_end,
     )
 
@@ -359,6 +366,7 @@ def run_module(
     repeat: int = 10,
     number: int = 10,
     profile: bool = False,
+    profile_options: str = "table,sort,aggregate,col_sums",
     end_to_end: bool = False,
 ):
     """Run a compiled graph executor module locally or remotely with
@@ -398,6 +406,8 @@ def run_module(
         Requires `benchmark` to be set to True.
     profile : bool
         Whether to profile the run with the debug executor.
+    profile_options : str
+        Comma-separated profiling options: table,csv,json,sort,aggregate,col_sums.
     end_to_end : bool
         Whether to measure the time of memory copies as well as model
         execution. Turning this on can provide a more realistic estimate
@@ -533,7 +543,15 @@ def run_module(
         logger.info("Running the module with profiling enabled.")
         report = module.profile()
         # This print is intentional
-        print(report)
+        if profile_options.find("table") != -1:
+            is_sort = profile_options.find("sort") != -1
+            is_aggr = profile_options.find("aggregate") != -1
+            is_sum = profile_options.find("col_sums") != -1
+            print(report.table(sort=is_sort, aggregate=is_aggr, col_sums=is_sum))
+        if profile_options.find("csv") != -1:
+            print(report.csv())
+        if profile_options.find("json") != -1:
+            print(report.json())
 
     if not benchmark or device == "micro":
         # TODO(gromero): Fix time_evaluator() for micro targets. Once it's
diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index dace7aaab913..6ee303891cd3 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -17,6 +17,7 @@ # pylint: disable=invalid-name, unused-argument, pointless-exception-statement.
"""CLML Library supported operators.""" import json +import os from string import Template import numpy as np import tvm @@ -29,6 +30,7 @@ from tvm.relay import function as _function from tvm.relay.expr_functor import ExprMutator from tvm.relay.expr import Call, TupleGetItem, Var, Constant +from tvm.relay.backend.executor_factory import GraphExecutorFactoryModule from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple from .register import register_pattern_table @@ -159,6 +161,13 @@ def partition_for_clml(mod, params=None, **opts): if params: mod["main"] = bind_params_by_name(mod["main"], params) + pass_context = tvm.get_global_func("transform.GetCurrentPassContext")() + target_version = ( + pass_context.config["relay.ext.clml.target_version"] + if "relay.ext.clml.target_version" in pass_context.config + else 3 + ) + seq = tvm.transform.Sequential( [ transform.InferType(), @@ -631,18 +640,35 @@ def __exit__(self, ptype, value, trace): self.op.set_attr(self.attr_key, self.older_attr) +@register_func("runtime.ext.debug.clml") +def process_debug(op, dump, dump_path): + """Dump the required debug information in given path""" + dump_json = json.loads(dump) + + graph_json = json.loads(dump_json["graph"]) + with open(os.path.join(dump_path, op + ".json"), "w") as outfile: + json.dump(graph_json, outfile, indent=4, sort_keys=False) + + hex_tensors = dump_json["tensors"] + fload = tvm._ffi.get_global_func("runtime.LoadParams") + tensor_map = fload(bytearray.fromhex(hex_tensors)) + np_tensors = {} + for key, val in tensor_map.items(): + np_tensors[key] = val.asnumpy() + np.savez(os.path.join(dump_path, op + ".npz"), **np_tensors) + + class CLMLGetSubModuleSrc: """Generates CLML API one CLML sub module out ot global TVM module""" - def __init__(self, cmod): + def __init__(self, codegen): """Initialize Parameters ---------- - cmod : Module - The CLML sub module from TVM module + codegen : JSON + The CLML sub module as JSON """ - self.cmod = cmod - self.codegen = None + self.codegen = codegen self.nodes = None self.node_map = {} self.input_meta = [] @@ -833,7 +859,6 @@ def __init__(self, cmod): def get_src(self): """Returns pair of sub module name and the generated source""" - self.codegen = json.loads(self.cmod.get_source("json")) self.sub_module_name = self.codegen["symbol"] self.nodes = self.codegen["nodes"] self.clml_code.append(self.MakeHeader.substitute(module=self.sub_module_name)) @@ -848,7 +873,7 @@ def get_tensor_from_map( dtype = str(node["attrs"]["dtype"][0][0]) if node["op"] == "input": self.clml_code.append("// Input Node") - node_out_name = self.sub_module_name + "_" + "input_" + str(node_seq) + node_out_name = node["name"] else: node_out_name = node["name"] if shape is None: @@ -1267,6 +1292,53 @@ def make_output_tensor( return (self.sub_module_name, self.clml_code) +HEADER_STR = """ + /* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + /*! + * \\file clml_models.cc + * \\brief CLML models for all subgraph in given TVM module. + */ + + // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE! + // ========================================================================= + + #include + #include + + #include + #include + #include + #include + #include + + // Project includes + #include "CL/cl.h" + #include "CL/cl_qcom_ml_ops.h" + + #include "clml_runner.h" + + using namespace tvm::runtime; +""" + + class CLMLGenSrc: """Generates CLML API source given a TVM compiled mod""" @@ -1274,8 +1346,7 @@ def __init__(self, libm): """Initialize Parameters ---------- - libm : Module - Compiled relay module + libm : Module or json codegen object """ self.libm = libm self.gen_src = [] @@ -1284,55 +1355,12 @@ def __init__(self, libm): self.codegen = None self.nodes = None - self.MakeFileHeader = Template( - """/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - - /*! - * \\file clml_models.cc - * \\brief CLML models for all subgraph in given TVM module. - */ - - // AUTO GENERATED BY TOOL (clml_codegen.py), PLEASE DO NOT CHANGE THIS FILE! 
- // ========================================================================= - - #include - #include - - #include - #include - #include - #include - #include - - // Project includes - #include "CL/cl.h" - #include "CL/cl_qcom_ml_ops.h" - - #include "clml_runner.h" - - using namespace tvm::runtime; - """ - ) + self.MakeFileHeader = Template(HEADER_STR) def get_clml_params(self): """Returns parameters from the TVM module""" + if not isinstance(self.libm, GraphExecutorFactoryModule): + return {} clml_params = {} if self.libm.get_lib().type_key == "const_loader": @@ -1353,14 +1381,21 @@ def get_clml_params(self): def get_artifacts(self): """Function that returns params as dict and source as list of cource code lines""" - self.clml_modules = list( - filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules) - ) self.clml_builds["file_header"] = [self.MakeFileHeader.substitute()] + if isinstance(self.libm, GraphExecutorFactoryModule): + self.clml_modules = list( + filter(lambda mod: mod.type_key == "clml", self.libm.get_lib().imported_modules) + ) - for cmod in self.clml_modules: - (sub_module_name, clml_code) = CLMLGetSubModuleSrc(cmod).get_src() + for cmod in self.clml_modules: + codegen = json.loads(cmod.get_source("json")) + (sub_module_name, clml_code) = CLMLGetSubModuleSrc(codegen).get_src() + self.clml_builds[sub_module_name] = clml_code + elif isinstance(self.libm, dict): + (sub_module_name, clml_code) = CLMLGetSubModuleSrc(self.libm).get_src() self.clml_builds[sub_module_name] = clml_code + else: + raise Exception("Don't know how to handle the input") main_code = [] main_code.append( diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc index c580123b1347..d8c0075fcdc1 100644 --- a/src/runtime/contrib/clml/clml_runtime.cc +++ b/src/runtime/contrib/clml/clml_runtime.cc @@ -23,11 +23,15 @@ */ #include "clml_runtime.h" +#include + #ifdef TVM_GRAPH_EXECUTOR_CLML #include "clml_memory_planner.h" #include "clml_utils.h" #endif +#include + namespace tvm { namespace runtime { namespace contrib { @@ -60,23 +64,28 @@ CLMLWorkspace::CLMLWorkspace() { result = clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, reqd_size, extn_buf.data(), nullptr); ICHECK(result == CL_SUCCESS) << "clGetDeviceInfo:" << result; std::string extensions(extn_buf.data()); - LOG(WARNING) << "OpenCL Extensions:" << extensions; + LOG_CLML << "OpenCL Extensions:" << extensions; if (extensions.find("cl_qcom_ml_ops") == std::string::npos) { LOG(FATAL) << "CLML Runtime Init: Qualcomm extn not present.\n"; return; } - is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos); - is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos); - LOG(WARNING) << "Recordable Queues Support :" << is_recordable_queue; - LOG(WARNING) << "On chip Memory Support :" << is_on_chip_memory; + if (getenv("CLML_DISABLE_RECORDABLE_QUEUE")) { + is_recordable_queue = 0; + is_on_chip_memory = 0; + } else { + is_recordable_queue = (extensions.find("cl_qcom_recordable_queues") != std::string::npos); + is_on_chip_memory = (extensions.find("cl_qcom_onchip_global_memory") != std::string::npos); + LOG_CLML << "Recordable Queues Support :" << is_recordable_queue; + LOG_CLML << "On chip Memory Support :" << is_on_chip_memory; + } if (is_on_chip_memory) { result = clGetDeviceInfo(device_id, CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM, sizeof(onchip_mem_size), &onchip_mem_size, nullptr); ICHECK(result == CL_SUCCESS) << 
"clGetDeviceInfo(CL_DEVICE_ONCHIP_GLOBAL_MEM_SIZE_QCOM):" << result; - LOG(WARNING) << "On chip memory size:" << onchip_mem_size; + LOG_CLML << "On chip memory size:" << onchip_mem_size; } // Query and Get CLML Interface @@ -106,10 +115,6 @@ CLMLWorkspace::CLMLWorkspace() { target_minor = 0; } - // ICHECK(target_minor <= CL_QCOM_ML_OPS_H_MINOR_VERSION) - // << "CLML runtime compiled with minor version " << CL_QCOM_ML_OPS_H_MINOR_VERSION - // << " where as the target supports higher version " << target_minor; - clGetMLInterfaceQCOM(&h_ClmlIntf, target_major, target_minor); ICHECK(nullptr != h_ClmlIntf) << "Couldn't get API interface, target is not supported." @@ -257,6 +262,167 @@ class CLMLRuntime : public JSONRuntimeBase { } } + std::string DebugDump(void) override { + if (cws->is_recordable_queue) { + LOG(FATAL) << "Debugging over recordable queues is not supported yet. You may disable the " + "same by exporting CLML_DISABLE_RECORDABLE_QUEUE at runtime."; + } + cl_command_queue queue = CLML_QUEUE; + Map dump_tensors; + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginObject(); + + writer.WriteObjectKeyValue("graph", graph_json_); + + int op_index = 0; + for (auto it = this->layer_.storage_map.begin(); it != this->layer_.storage_map.end(); it++) { + int nid = it->first; + auto clml_desc = it->second.first; + auto node = it->second.second; + + if ("kernel" == node.GetOpType()) { + CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[op_index], + this->layer_.descriptorSet, 0, nullptr, nullptr); + OPENCL_CALL(clFinish(queue)); + op_index++; + } + + // Dump tensor to CPU + std::vector shape = node.GetOpShape()[0]; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + NDArray narr = NDArray::Empty(ShapeTuple(shape), tvm_dtype, {kDLCPU, 0}); + CopyDataFromCLMLTensor(clml_desc, narr.operator->()->data); + + // Naming convention + std::string node_name; + bool is_out = false; + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + is_out = (eid == nid); + } + if (is_out) { + node_name = clml_symbol + "_layer_out_" + std::to_string(nid); + } else if (("const" == node.GetOpType()) || ("input" == node.GetOpType())) { + node_name = node.GetOpName(); + } else { + node_name = node.GetOpName() + "____topo-index:" + std::to_string(nid); + } + dump_tensors.Set(node_name, narr); + } + + const PackedFunc* f = Registry::Get("runtime.SaveParams"); + if (nullptr != f) { + std::string dump_bytes = (*f)(dump_tensors); + std::ostringstream oss; + /*TODO(Siva) HEX encoding doubles the size, look for better encode that can cross the RPC. 
*/ + for (size_t i = 0; i < dump_bytes.size(); ++i) { + oss << std::setw(2) << std::setfill('0') << std::hex << static_cast(dump_bytes[i]); + } + writer.WriteObjectKeyValue("tensors", oss.str()); + } + + writer.EndObject(); + return os.str(); + } + + void RunProfile(profiling::Profiler* prof) override { + cl_command_queue queue = CLML_QUEUE; + std::vector& evts = cws->workspace->GetEventQueue(cws->tentry->device); + std::vector cs; + std::vector devices; + devices.push_back(cws->tentry->device); + + for (size_t i = 0; i < input_nodes_.size(); ++i) { + auto nid = input_nodes_[i]; + uint32_t eid = EntryID(nid, 0); + if (nodes_[nid].GetOpType() == "input") { + // Assuming all inputs are from OpenCL + if (kDLOpenCL == data_entry_[eid]->device.device_type) { + layer_.in_placeholder[nid]->memory = static_cast( + ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); + cl_event cpy_evt = nullptr; + cl_event* evt = &cpy_evt; + if (cws->workspace->IsProfiling(cws->tentry->device)) { + evts.resize(evts.size() + 1); + evt = &(evts.back()); + } + std::unordered_map metrics; + std::string shape_str; + std::vector shape = nodes_[nid].GetOpShape()[0]; + DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + metrics["Argument Shapes"] = String(shape_str); + + prof->StartCall("CopyIn", cws->tentry->device, metrics); + CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.in_placeholder[nid]->tensor, + layer_.in_placeholder[nid]->memory, layer_.inputs[nid]->tensor, + layer_.inputs[nid]->memory, 0, nullptr, evt); + prof->StopCall(); + } + } + } + + for (size_t i = 0; i < this->layer_.function.size(); ++i) { + std::unordered_map metrics; + auto node = this->layer_.op_node_map[this->layer_.function[i]].second; + std::string shape_str; + for (uint32_t j = 0; j < node.GetInputs().size(); ++j) { + const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_]; + std::vector shape = in_node.GetOpShape()[0]; + DLDataType tvm_dtype = in_node.GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + shape_str.append(", "); + } + // Assuming one output per operation + std::vector shape = node.GetOpShape()[0]; + DLDataType tvm_dtype = node.GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + metrics["Argument Shapes"] = String(shape_str); + + // Launch call + prof->StartCall(clml_symbol + "-" + this->layer_.layer_names[i], cws->tentry->device, + metrics); + queue = CLML_QUEUE; + evts.resize(evts.size() + 1); + cl_event* evt = &(evts.back()); + CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0, + nullptr, evt); + prof->StopCall(); + } + + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + + // Assuming all outputs are to OpenCL + if (kDLOpenCL == data_entry_[eid]->device.device_type) { + layer_.out_placeholder[i]->memory = static_cast( + ((cl::BufferDescriptor*)const_cast(data_entry_[eid])->data)->buffer); + cl_event cpy_evt = nullptr; + cl_event* evt = &cpy_evt; + if (cws->workspace->IsProfiling(cws->tentry->device)) { + evts.resize(evts.size() + 1); + evt = &(evts.back()); + } + + std::unordered_map metrics; + std::string shape_str; + std::vector shape = nodes_[eid].GetOpShape()[0]; + DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0]; + shape_str.append(profiling::ShapeString(shape, tvm_dtype)); + metrics["Argument Shapes"] = String(shape_str); + + prof->StartCall("CopyOut", cws->tentry->device, 
metrics); + CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, layer_.outputs[i]->tensor, + layer_.outputs[i]->memory, layer_.out_placeholder[i]->tensor, + layer_.out_placeholder[i]->memory, 0, nullptr, evt); + prof->StopCall(); + } + } + + return; + } + /*! * \brief Unpack inputs and outputs and run inference on a given layer. * @@ -305,7 +471,7 @@ class CLMLRuntime : public JSONRuntimeBase { int64_t duration = 0; if (cws->is_recordable_queue) { - if (getenv("CLML_PROFILING")) { + if (cws->workspace->IsProfiling(cws->tentry->device)) { Timer t; auto f = Registry::Get(std::string("profiling.timer.opencl")); t = f->operator()(cws->tentry->device); @@ -324,7 +490,7 @@ class CLMLRuntime : public JSONRuntimeBase { } else { for (size_t i = 0; i < this->layer_.function.size(); ++i) { // Make CLML subgraphs accounted by OpenCLTimerNode. - if (getenv("CLML_PROFILING")) { + if (cws->workspace->IsProfiling(cws->tentry->device)) { Timer t; auto f = Registry::Get(std::string("profiling.timer.opencl")); t = f->operator()(cws->tentry->device); @@ -336,16 +502,16 @@ class CLMLRuntime : public JSONRuntimeBase { 0, nullptr, evt); t->Stop(); duration += t->SyncAndGetElapsedNanos(); - LOG(WARNING) << "Layer:" << this->layer_.layer_names[i] - << " Duration:" << t->SyncAndGetElapsedNanos(); + LOG_CLML << "Layer:" << this->layer_.layer_names[i] + << " Duration:" << t->SyncAndGetElapsedNanos(); } else { CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i], this->layer_.descriptorSet, 0, nullptr, nullptr); } } } - if (getenv("CLML_PROFILING")) { - LOG(WARNING) << "Total Duration for " << clml_symbol << " is:" << duration; + if (cws->workspace->IsProfiling(cws->tentry->device)) { + LOG_CLML << "Total Duration for " << clml_symbol << " is:" << duration; } for (size_t i = 0; i < outputs_.size(); ++i) { @@ -616,6 +782,8 @@ class CLMLRuntime : public JSONRuntimeBase { else LOG(FATAL) << "Unsupported op: " << op_name; this->layer_.layer_names.push_back(op_name); + // Keep map of function and Node to use in profiling + this->layer_.op_node_map.insert({this->layer_.function.back(), std::make_pair(nid, node)}); } else if (node.GetOpType() != "const") { LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType(); } @@ -710,11 +878,11 @@ class CLMLRuntime : public JSONRuntimeBase { this->layer_.tensorMemDescs.data()); if (cws->is_tuning_run) { - LOG(WARNING) << "CLML Tunning In Progress:"; + LOG_CLML << "CLML Tunning In Progress:"; // Let the command queue recreated in profiling mode. 
cl::OpenCLWorkspace::Global()->EnableQueueProfiling(cws->tentry->device, true); for (size_t i = 0; i < this->layer_.function.size(); ++i) { - LOG(WARNING) << "CLML Tunning:" << this->layer_.layer_names[i]; + LOG_CLML << "CLML Tunning:" << this->layer_.layer_names[i]; CLML_CALL(clTuneMLOpQCOM, CLML_QUEUE, this->layer_.function[i], this->layer_.descriptorSet, this->layer_.tuning_cache, nullptr); } @@ -741,8 +909,8 @@ class CLMLRuntime : public JSONRuntimeBase { std::ofstream fs(cws->tuning_file, std::ios::app | std::ios::binary); ICHECK(!fs.fail()) << "Cannot open " << cws->tuning_file; fs.write(&tune_str[0], tune_str.length()); - LOG(WARNING) << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size" - << tune_str.length() << " with tuning blob len " << saved_cache.size(); + LOG_CLML << "CLML: Tuning cache dumped to:" << cws->tuning_file << " size" + << tune_str.length() << " with tuning blob len " << saved_cache.size(); } if (cws->is_recordable_queue) { for (size_t i = 0; i < this->layer_.function.size(); ++i) { @@ -1591,6 +1759,8 @@ class CLMLRuntime : public JSONRuntimeBase { << "Please build with USE_CLML_GRAPH_EXECUTOR."; } #endif + bool CanDebug() override { return true; } + /*! CLML sub graph symbol in TVM main module */ std::string clml_symbol; }; diff --git a/src/runtime/contrib/clml/clml_runtime.h b/src/runtime/contrib/clml/clml_runtime.h index f346ce7af696..9dfde2f7820d 100644 --- a/src/runtime/contrib/clml/clml_runtime.h +++ b/src/runtime/contrib/clml/clml_runtime.h @@ -164,8 +164,10 @@ static const uint64_t kTVMCLMLTuningCacheMagic = 0x434C4D4C54554E45; #define DEBUG_MEMORY_ALLOC false #define DEBUG_STATS false +#define DEBUG_CLML false #define LOG_MEM LOG_IF(WARNING, DEBUG_MEMORY_ALLOC) #define LOG_STATS LOG_IF(WARNING, DEBUG_STATS) +#define LOG_CLML LOG_IF(WARNING, DEBUG_CLML) namespace tvm { namespace runtime { @@ -235,6 +237,8 @@ class CLMLThreadEntry { struct CachedLayer { /* List of all created CLML operation handles in graph */ std::vector function; + /* Map of function and original JsonNode */ + std::map> op_node_map; /* The input tensor map */ std::map> inputs; /* A place holder Tensor representing TVM NDArray as CLML Tensor */ diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index 8eec0447a189..8e105dab7837 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -69,6 +70,25 @@ class JSONRuntimeBase : public ModuleNode { /*! \brief Invoke the execution engine to inteprete a specific json runtime. */ virtual void Run() = 0; + /*! \brief Does the backend support debug & profiling */ + virtual bool CanDebug() { return false; } + + /*! + * \brief Invoke the profiler + * \param pointer to profiler + */ + virtual void RunProfile(profiling::Profiler* prof) { + LOG(FATAL) << "Not expected to be here : Profiling call w/o support ?"; + } + + /*! + * \brief Invoke the debugger + * \return External compiler specific debug blob + */ + virtual std::string DebugDump(void) { + LOG(FATAL) << "Not expected to be here : Debug dump w/o support ?"; + } + /*! * \brief Get a packed function. * \param name The name/symbol of the function. @@ -88,9 +108,32 @@ class JSONRuntimeBase : public ModuleNode { // Bind argument tensors to data entries. this->SetInputOutputBuffers(args); + // Execute the subgraph. 
this->Run(); }); + } else if (this->symbol_name_ + "_debug" == name) { + if (!this->CanDebug()) { + return PackedFunc(nullptr); + } + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + ICHECK(this->initialized_) << "The module has not been initialized"; + + // Bind argument tensors to data entries. + this->SetInputOutputBuffers(args); + + if (rv->IsObjectRef()) { + String purpose = *rv; + if ("debug_dump" == purpose) { + *rv = this->DebugDump(); + } + } else { + // Profile the subgraph. + profiling::Profiler* prof = static_cast(rv->value().v_handle); + this->RunProfile(prof); + } + // String vendor_prof = this->RunProfile(prof); + }); } else if ("__init_" + this->symbol_name_ == name) { // The function to initialize constant tensors. return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc index 892a13b46bb4..a9cd4d544d3b 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.cc +++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc @@ -213,6 +213,9 @@ PackedFunc GraphExecutorDebug::GetFunction(const String& name, } else if (name == "execute_node") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->ExecuteNode(args[0]); }); + } else if (name == "debug_run_ext_compiler") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->DebugRunExtCompiler(); }); } else if (name == "get_node_output") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetNodeOutput(args[0], args[1]); @@ -320,6 +323,31 @@ void GraphExecutorDebug::ExecuteNode(int node) { last_executed_node_ = end_ind; } +std::string GraphExecutorDebug::DebugRunExtCompiler(void) { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.BeginArray(); + for (size_t i = 0; i < op_execs_.size(); ++i) { + if (!nodes_[i].param.compiler.empty() && op_profile_execs_[i]) { + TVMRetValue rv; + rv = String("debug_dump"); + this->op_profile_execs_[i](&rv); + std::string debug_ret = rv; + + writer.BeginObject(); + writer.WriteObjectKeyValue("compiler", nodes_[i].param.compiler); + writer.WriteObjectKeyValue("op", nodes_[i].param.func_name); + writer.WriteObjectKeyValue("dump", debug_ret); + writer.EndObject(); + } else { + if (op_execs_[i]) op_execs_[i](); + } + } + writer.EndArray(); + + return os.str(); +} + void GraphExecutorDebug::DebugGetNodeOutput(int index, DLTensor* data_out) { ICHECK_LT(static_cast(index), op_execs_.size()); uint32_t eid = index; @@ -386,9 +414,15 @@ profiling::Report GraphExecutorDebug::Profile(Array metrics["Hash"] = Downcast(nodes_[i].param.attrs.at("hash")); } metrics["Argument Shapes"] = profiling::ShapeString(shapes); - prof.StartCall(nodes_[i].param.func_name, dev, metrics); - op_execs_[i](); - prof.StopCall(); + if (!nodes_[i].param.compiler.empty() && op_profile_execs_[i]) { + TVMRetValue rv; + rv = static_cast(&prof); + this->op_profile_execs_[i](&rv); + } else { + prof.StartCall(nodes_[i].param.func_name, dev, metrics); + op_execs_[i](); + prof.StopCall(); + } } } prof.Stop(); diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.h b/src/runtime/graph_executor/debug/graph_executor_debug.h index 382083056604..8ede2a3a5f84 100644 --- a/src/runtime/graph_executor/debug/graph_executor_debug.h +++ b/src/runtime/graph_executor/debug/graph_executor_debug.h @@ -98,6 +98,15 @@ class GraphExecutorDebug : public 
GraphExecutor { */ void ExecuteNode(int node); + /*! + * \brief debug external comilers if supported. + * + * This method invokes the external compilers to generate any debug trace info. + * + * \return Returns serialized debug trace information to the caller + */ + std::string DebugRunExtCompiler(void); + /*! * \brief Returns index-th output of node. * diff --git a/src/runtime/graph_executor/graph_executor.cc b/src/runtime/graph_executor/graph_executor.cc index 6324da9c27ef..1b1051322c49 100644 --- a/src/runtime/graph_executor/graph_executor.cc +++ b/src/runtime/graph_executor/graph_executor.cc @@ -503,6 +503,7 @@ void GraphExecutor::SetupStorage() { void GraphExecutor::SetupOpExecs() { op_execs_.resize(this->GetNumOfNodes()); + op_profile_execs_.resize(this->GetNumOfNodes()); input_dltensors_.resize(num_node_entries()); output_dltensors_.resize(num_node_entries()); both_output_opinput_dltensors_.resize(num_node_entries()); @@ -532,7 +533,7 @@ void GraphExecutor::SetupOpExecs() { ICHECK(inode.op_type == "tvm_op") << "Can only take tvm_op as op"; std::shared_ptr op_args = nullptr; - std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args); + std::tie(op_execs_[nid], op_profile_execs_[nid], op_args) = CreateTVMOp(inode.param, args); for (size_t i = 0; i < inode.inputs.size(); i++) { uint32_t input_eid = this->entry_id(inode.inputs[i]); @@ -581,8 +582,9 @@ void GraphExecutor::SetupOpExecs() { } } -std::pair, std::shared_ptr> GraphExecutor::CreateTVMOp( - const TVMOpParam& param, const std::vector& args) { +std::tuple, std::function, + std::shared_ptr> +GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector& args) { std::shared_ptr arg_ptr = std::make_shared(); // setup address. arg_ptr->args = args; @@ -604,7 +606,7 @@ std::pair, std::shared_ptr> GraphEx } if (param.func_name == "__nop") { - return {[]() {}, arg_ptr}; + return {[]() {}, [](TVMRetValue* rv) {}, arg_ptr}; } else if (param.func_name == "__copy") { // Perform cross device data copy. // Directly copy data from the input to the output. @@ -614,21 +616,31 @@ std::pair, std::shared_ptr> GraphEx DLTensor* to = static_cast(arg_ptr->arg_values[1].v_handle); TVM_CCALL(TVMArrayCopyFromTo(from, to, nullptr)); }; - return {fexec, arg_ptr}; + return {fexec, [](TVMRetValue* rv) {}, arg_ptr}; } // Get compiled function from the module that contains both host and device // code. tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, true); ICHECK(pf != nullptr) << "no such function in module: " << param.func_name; - auto fexec = [arg_ptr, pf]() { TVMRetValue rv; TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), static_cast(arg_ptr->arg_values.size())); pf.CallPacked(targs, &rv); }; - return {fexec, arg_ptr}; + + pf = module_.GetFunction(param.func_name + "_debug", true); + std::function fexec_profile = nullptr; + if (pf != nullptr) { + fexec_profile = [arg_ptr, pf](TVMRetValue* rv) { + TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), + static_cast(arg_ptr->arg_values.size())); + pf.CallPacked(targs, rv); + }; + } + + return {fexec, fexec_profile, arg_ptr}; } PackedFunc GraphExecutor::GetFunction(const String& name, const ObjectPtr& sptr_to_self) { diff --git a/src/runtime/graph_executor/graph_executor.h b/src/runtime/graph_executor/graph_executor.h index 53e2801d574e..cfdba8916baa 100644 --- a/src/runtime/graph_executor/graph_executor.h +++ b/src/runtime/graph_executor/graph_executor.h @@ -56,6 +56,7 @@ using memory::MemoryManager; /*! 
\brief operator attributes about tvm op */ struct TVMOpParam { std::string func_name; + std::string compiler; std::unordered_map attrs; uint32_t num_inputs; uint32_t num_outputs; @@ -272,6 +273,9 @@ class TVM_DLL GraphExecutor : public ModuleNode { if (key == "func_name") { param->func_name = value; bitmask |= 1; + } + if (key == "Compiler") { + param->compiler = value; } else if (key == "num_inputs") { param->num_inputs = strtoul(value.c_str(), nullptr, 10); bitmask |= 2; @@ -440,8 +444,8 @@ class TVM_DLL GraphExecutor : public ModuleNode { * \param args The arguments to the functor, including inputs and outputs. * \return The created executor. */ - std::pair, std::shared_ptr> CreateTVMOp( - const TVMOpParam& attrs, const std::vector& args); + std::tuple, std::function, std::shared_ptr> + CreateTVMOp(const TVMOpParam& attrs, const std::vector& args); // Get node entry index. uint32_t entry_id(uint32_t nid, uint32_t index) const { return node_row_ptr_[nid] + index; } // Get node entry index. @@ -486,6 +490,8 @@ class TVM_DLL GraphExecutor : public ModuleNode { std::vector data_alignment_; /*! \brief Operator on each node. */ std::vector> op_execs_; + /*! \brief Profilable Operator on each node. */ + std::vector> op_profile_execs_; /*! \brief Linked parameter lookup function. */ PackedFunc lookup_linked_param_; /*! \brief Module's _lookup_linked_param function, used by DefaultLookupLinkedParam. */ diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index e0abd1841b64..2e9b05edcb58 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -514,7 +514,7 @@ class OpenCLTimerNode : public TimerNode { cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear(); // Very first call of Start() leads to the recreation of // OpenCL command queue in profiling mode. This allows to run profile after inference. 
- recreateCommandQueue(); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, true); } ++count_timer_execs; // set new first idx in event queue @@ -549,7 +549,7 @@ class OpenCLTimerNode : public TimerNode { // Profiling session ends, recreate clCommandQueue in non-profiling mode // This will disable collection of cl_events in case of executing inference after profile if (count_timer_execs == 0) { - recreateCommandQueue(); + cl::OpenCLWorkspace::Global()->EnableQueueProfiling(dev_, false); event_start_idxs.clear(); } } @@ -565,11 +565,6 @@ class OpenCLTimerNode : public TimerNode { private: int64_t duration; Device dev_; - - void recreateCommandQueue() { - cl::OpenCLWorkspace::Global()->EnableQueueProfiling( - dev_, !cl::OpenCLWorkspace::Global()->IsProfiling(dev_)); - } }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc index 6a42d840b206..83be98556a9e 100644 --- a/src/runtime/profiling.cc +++ b/src/runtime/profiling.cc @@ -671,7 +671,7 @@ Report Profiler::Report() { rows.push_back(row); } - // the last couple of call frames are the overall times + // the last frames are the overall times double overall_time_us = 0; std::unordered_map> device_metrics; for (size_t i = 0; i < devs_.size(); i++) { @@ -776,7 +776,6 @@ Report Report::FromJSON(String json) { configuration = parse_metrics(&reader); } } - return Report(calls, device_metrics, configuration); } diff --git a/tests/scripts/setup-adreno-env.sh b/tests/scripts/setup-adreno-env.sh index b0c3559bf081..a35a633e1dfd 100755 --- a/tests/scripts/setup-adreno-env.sh +++ b/tests/scripts/setup-adreno-env.sh @@ -112,7 +112,7 @@ case ${ENVIRONMENT} in adb forward tcp:$((LISTEN_PORT + 1)) tcp:$((LISTEN_PORT + 1)) adb forward tcp:$((LISTEN_PORT + 2)) tcp:$((LISTEN_PORT + 2)) adb forward tcp:$((LISTEN_PORT + 3)) tcp:$((LISTEN_PORT + 3)) - adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_PROFILING=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" + adb shell "cd ${TARGET_FOLDER}; killall -9 tvm_rpc-${USER}; sleep 2; export CLML_DISABLE_RECORDABLE_QUEUE=1; export CLML_IS_TUNING_RUN=1; export CLML_TUNING_CACHE=clml.bin; LD_LIBRARY_PATH=${TARGET_FOLDER}/ ./tvm_rpc-${USER} server --host=0.0.0.0 --port=${LISTEN_PORT} --port-end=$((LISTEN_PORT + 10)) --tracker=127.0.0.1:${TVM_TRACKER_PORT} --key=${RPC_DEVICE_KEY}" ;; "query")
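
Usage note (outside the patch): the intended flow is that the debug executor collects per-layer outputs plus a per-CLML-subgraph dump through the "runtime.ext.debug.clml" hook, and the new scripts consume those dumps offline. A minimal sketch, assuming an already compiled OpenCL+CLML artifact; the file names, dump directory, and tensor names below are illustrative and not defined by this patch:

    # Sketch only: "model_clml.so" / "model_clml.json" are assumed prebuilt artifacts.
    import tvm
    from tvm.contrib.debugger import debug_executor

    dev = tvm.cl(0)
    lib = tvm.runtime.load_module("model_clml.so")
    graph_json = open("model_clml.json").read()

    # dump_root receives output_tensors.npz (from debug_result.py) and the
    # <clml_subgraph>.json / <clml_subgraph>.npz files written by the
    # runtime.ext.debug.clml handler registered in clml.py.
    m = debug_executor.create(graph_json, lib, dev, dump_root="/tmp/tvmdbg")
    m.run()

    # Offline inspection with the new scripts (paths illustrative):
    #   python compare_npy.py reference.npz <clml_subgraph>.npz <tensor_name>
    #   python clml_codegen_json.py <clml_subgraph>.json clml_models.cc
    # Profiling output selection via tvmc (illustrative):
    #   tvmc run --profile --profile-options table,sort,csv compiled_module.tar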