diff --git a/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp b/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp
index 58ff7c3c81..e0e7296b73 100644
--- a/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp
+++ b/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp
@@ -9,6 +9,7 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_hw_context.h"
 #include "xrt/xrt_kernel.h"
+#include <experimental/xrt_kernel.h>
 namespace XBU = XBUtilities;
 
 #include <filesystem>
@@ -80,23 +81,35 @@ TestNPULatency::run(std::shared_ptr<xrt_core::device> dev)
     return ptree;
   }
 
-  //Create BOs, the values are not initialized as they are not really used by this special test running on the device
-  int argno = 1;
-  xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++));
-  argno++;
-  xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  //Create ctrlcode with NOPs
-  std::memset(bo_instr.map<char*>(), 0, buffer_size);
-
-  //Sync BOs
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  xrt::xclbin::ip cu;
+  for (const auto& ip : xclbin.get_ips()) {
+    if (ip.get_type() != xrt::xclbin::ip::ip_type::ps)
+      continue;
+    cu = ip;
+    break;
+  }
+
+  // create a single run and populate it with arguments
+  auto run = xrt::run(testker);
+  for (const auto& arg : cu.get_args()) {
+    auto arg_idx = static_cast<int>(arg.get_index());
+    if (arg.get_host_type() == "uint64_t")
+      run.set_arg(arg_idx, static_cast<uint64_t>(1));
+    else if (arg.get_host_type() == "uint32_t")
+	    run.set_arg(arg_idx, static_cast<uint32_t>(1));
+    else if (arg.get_host_type().find('*') != std::string::npos) {
+      xrt::bo bo;
+
+      if (arg.get_name() == "instruct")
+        bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx));
+      else 
+        bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx));
+
+      bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+	    run.set_arg(arg_idx, bo);
+    }
+  } 
+
   //Log
   if(XBU::getVerbose()) {
     logger(ptree, "Details", boost::str(boost::format("Instruction size: '%f' bytes") % buffer_size));
@@ -105,17 +118,16 @@ TestNPULatency::run(std::shared_ptr<xrt_core::device> dev)
 
   // Run the test to compute latency where we submit one job at a time and wait for its completion before
   // we submit the next one
-  float elapsedSecs = 0.0;
+  float elapsed_secs = 0.0;
 
   try {
     auto start = std::chrono::high_resolution_clock::now();
     for (int i = 0; i < itr_count; i++) {
-      auto hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc);
-      // Wait for kernel to be done
-      hand.wait2();
+      run.start();
+      run.wait2();
     }
     auto end = std::chrono::high_resolution_clock::now();
-    elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
+    elapsed_secs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
   }
   catch (const std::exception& ex) {
     logger(ptree, "Error", ex.what());
@@ -123,7 +135,7 @@ TestNPULatency::run(std::shared_ptr<xrt_core::device> dev)
   }
 
   // Calculate end-to-end latency of one job execution
-  const float latency = (elapsedSecs / itr_count) * 1000000; //convert s to us
+  const float latency = (elapsed_secs / itr_count) * 1000000; //convert s to us
   logger(ptree, "Details", boost::str(boost::format("Average latency: '%.1f' us") % latency));
   ptree.put("status", test_token_passed);
   return ptree;
diff --git a/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp b/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp
index ceffe31986..cb7394f6f9 100644
--- a/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp
+++ b/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp
@@ -9,13 +9,15 @@
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_hw_context.h"
 #include "xrt/xrt_kernel.h"
+#include <experimental/xrt_kernel.h>
 namespace XBU = XBUtilities;
 
 #include <filesystem>
 
 static constexpr size_t host_app = 1; //opcode
 static constexpr size_t buffer_size = 20;
-static constexpr int itr_count_throughput = 2500;
+static constexpr int run_buffer = 9; // number of xrt::run objects created up front; at most this many are started at once
+static constexpr int itr_count_throughput = 2502; // total run starts in the timed loop; 2502 = 9 * 278 (presumably kept a multiple of run_buffer - TODO confirm)
 // ----- C L A S S   M E T H O D S -------------------------------------------
 TestNPUThroughput::TestNPUThroughput()
   : TestRunner("throughput", "Run end-to-end throughput test")
@@ -79,23 +81,41 @@ TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
     return ptree;
   }
 
-  //Create BOs, the values are not initialized as they are not really used by this special test running on the device
-  int argno = 1;
-  xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++));
-  argno++;
-  xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++));
-  //Create ctrlcode with NOPs
-  std::memset(bo_instr.map<char*>(), 0, buffer_size);
-
-  //Sync BOs
-  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-  bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  xrt::xclbin::ip cu;
+  for (const auto& ip : xclbin.get_ips()) {
+    if (ip.get_type() != xrt::xclbin::ip::ip_type::ps)
+      continue;
+    cu = ip;
+    break;
+  }
+
+  // create the specified number of runs and populate each with arguments
+  std::vector<xrt::bo> global_args;
+  std::vector<xrt::run> run_handles;
+
+  for (int i=0; i < run_buffer; ++i) {
+    auto run = xrt::run(testker);
+    for (const auto& arg : cu.get_args()) {
+      auto arg_idx = static_cast<int>(arg.get_index());
+      if (arg.get_host_type() == "uint64_t")
+	      run.set_arg(arg_idx, static_cast<uint64_t>(1));
+      else if (arg.get_host_type() == "uint32_t")
+	      run.set_arg(arg_idx, static_cast<uint32_t>(1));
+      else if (arg.get_host_type().find('*') != std::string::npos) {
+        xrt::bo bo;
+
+        if (arg.get_name() == "instruct")
+          bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx));
+        else 
+          bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx));
+
+      bo.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+	    global_args.push_back(bo);
+	    run.set_arg(arg_idx, bo);
+      }
+    }
+    run_handles.push_back(std::move(run));
+  }
 
   //Log
   if(XBU::getVerbose()) {
@@ -106,14 +126,18 @@ TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
   // Run the test to compute throughput where we saturate NPU with jobs and then wait for all
   // completions at the end
   float elapsedSecs = 0.0;
-  std::array<xrt::run, itr_count_throughput> runhandles;
 
   try {
     auto start = std::chrono::high_resolution_clock::now();
-    for (auto & hand : runhandles)
-      hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc);
-    for (const auto& hand: runhandles)
-      hand.wait2();
+    //enqueue the first run_buffer (9) commands
+    for(int i = 0; i < run_buffer; i++) {
+      run_handles[i%run_buffer].start();
+    }
+    //wait for each command to finish, then re-submit it to the queue
+    for(int i = 0; i < (itr_count_throughput-run_buffer); i++) {
+      run_handles[i%run_buffer].wait2();
+      run_handles[i%run_buffer].start();
+    }
     auto end = std::chrono::high_resolution_clock::now();
     elapsedSecs = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
   }
@@ -123,7 +147,7 @@ TestNPUThroughput::run(std::shared_ptr<xrt_core::device> dev)
   }
 
   // Compute the throughput
-  const double throughput = (elapsedSecs != 0.0) ? runhandles.size() / elapsedSecs : 0.0;
+  const double throughput = (elapsedSecs != 0.0) ? itr_count_throughput / elapsedSecs : 0.0;
 
   logger(ptree, "Details", boost::str(boost::format("Average throughput: '%.1f' ops") % throughput));
   ptree.put("status", test_token_passed);