diff --git a/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp b/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp index 58ff7c3c81..e0e7296b73 100644 --- a/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp +++ b/src/runtime_src/core/tools/common/tests/TestNPULatency.cpp @@ -9,6 +9,7 @@ #include "xrt/xrt_device.h" #include "xrt/xrt_hw_context.h" #include "xrt/xrt_kernel.h" +#include namespace XBU = XBUtilities; #include @@ -80,23 +81,35 @@ TestNPULatency::run(std::shared_ptr dev) return ptree; } - //Create BOs, the values are not initialized as they are not really used by this special test running on the device - int argno = 1; - xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++)); - argno++; - xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - //Create ctrlcode with NOPs - std::memset(bo_instr.map(), 0, buffer_size); - - //Sync BOs - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE); + xrt::xclbin::ip cu; + for (const auto& ip : xclbin.get_ips()) { + if (ip.get_type() != xrt::xclbin::ip::ip_type::ps) + continue; + cu = ip; + break; + } + + // create a run and populate it with arguments + auto run = xrt::run(testker); + for (const auto& arg : cu.get_args()) { + auto arg_idx = static_cast(arg.get_index()); + if (arg.get_host_type() == "uint64_t") + run.set_arg(arg_idx, static_cast(1)); + else if (arg.get_host_type() == "uint32_t") + run.set_arg(arg_idx, 
static_cast(1)); + else if (arg.get_host_type().find('*') != std::string::npos) { + xrt::bo bo; + + if (arg.get_name() == "instruct") + bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx)); + else + bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx)); + + bo.sync(XCL_BO_SYNC_BO_TO_DEVICE); + run.set_arg(arg_idx, bo); + } + } + //Log if(XBU::getVerbose()) { logger(ptree, "Details", boost::str(boost::format("Instruction size: '%f' bytes") % buffer_size)); @@ -105,17 +118,16 @@ TestNPULatency::run(std::shared_ptr dev) // Run the test to compute latency where we submit one job at a time and wait for its completion before // we submit the next one - float elapsedSecs = 0.0; + float elapsed_secs = 0.0; try { auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < itr_count; i++) { - auto hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc); - // Wait for kernel to be done - hand.wait2(); + run.start(); + run.wait2(); } auto end = std::chrono::high_resolution_clock::now(); - elapsedSecs = std::chrono::duration_cast>(end-start).count(); + elapsed_secs = std::chrono::duration_cast>(end-start).count(); } catch (const std::exception& ex) { logger(ptree, "Error", ex.what()); @@ -123,7 +135,7 @@ TestNPULatency::run(std::shared_ptr dev) } // Calculate end-to-end latency of one job execution - const float latency = (elapsedSecs / itr_count) * 1000000; //convert s to us + const float latency = (elapsed_secs / itr_count) * 1000000; //convert s to us logger(ptree, "Details", boost::str(boost::format("Average latency: '%.1f' us") % latency)); ptree.put("status", test_token_passed); return ptree; diff --git a/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp b/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp index ceffe31986..cb7394f6f9 100644 --- a/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp +++ 
b/src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp @@ -9,13 +9,15 @@ #include "xrt/xrt_device.h" #include "xrt/xrt_hw_context.h" #include "xrt/xrt_kernel.h" +#include namespace XBU = XBUtilities; #include static constexpr size_t host_app = 1; //opcode static constexpr size_t buffer_size = 20; -static constexpr int itr_count_throughput = 2500; +static constexpr int run_buffer = 9; +static constexpr int itr_count_throughput = 2502; // ----- C L A S S M E T H O D S ------------------------------------------- TestNPUThroughput::TestNPUThroughput() : TestRunner("throughput", "Run end-to-end throughput test") @@ -79,23 +81,41 @@ TestNPUThroughput::run(std::shared_ptr dev) return ptree; } - //Create BOs, the values are not initialized as they are not really used by this special test running on the device - int argno = 1; - xrt::bo bo_ifm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_param(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_ofm(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_inter(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - xrt::bo bo_instr(working_dev, buffer_size, XCL_BO_FLAGS_CACHEABLE, testker.group_id(argno++)); - argno++; - xrt::bo bo_mc(working_dev, buffer_size, XRT_BO_FLAGS_HOST_ONLY, testker.group_id(argno++)); - //Create ctrlcode with NOPs - std::memset(bo_instr.map(), 0, buffer_size); - - //Sync BOs - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE); + xrt::xclbin::ip cu; + for (const auto& ip : xclbin.get_ips()) { + if (ip.get_type() != xrt::xclbin::ip::ip_type::ps) + continue; + cu = ip; + break; + } + + // create specified number of runs and populate with arguments + std::vector global_args; + std::vector run_handles; + + for (int i=0; i < run_buffer; ++i) 
{ + auto run = xrt::run(testker); + for (const auto& arg : cu.get_args()) { + auto arg_idx = static_cast(arg.get_index()); + if (arg.get_host_type() == "uint64_t") + run.set_arg(arg_idx, static_cast(1)); + else if (arg.get_host_type() == "uint32_t") + run.set_arg(arg_idx, static_cast(1)); + else if (arg.get_host_type().find('*') != std::string::npos) { + xrt::bo bo; + + if (arg.get_name() == "instruct") + bo = xrt::bo(hwctx, arg.get_size(), xrt::bo::flags::cacheable, testker.group_id(arg_idx)); + else + bo = xrt::bo(working_dev, arg.get_size(), xrt::bo::flags::host_only, testker.group_id(arg_idx)); + + bo.sync(XCL_BO_SYNC_BO_TO_DEVICE); + global_args.push_back(bo); + run.set_arg(arg_idx, bo); + } + } + run_handles.push_back(std::move(run)); + } //Log if(XBU::getVerbose()) { @@ -106,14 +126,18 @@ TestNPUThroughput::run(std::shared_ptr dev) // Run the test to compute throughput where we saturate NPU with jobs and then wait for all // completions at the end float elapsedSecs = 0.0; - std::array runhandles; try { auto start = std::chrono::high_resolution_clock::now(); - for (auto & hand : runhandles) - hand = testker(host_app, bo_ifm, bo_param, bo_ofm, bo_inter, bo_instr, buffer_size, bo_mc); - for (const auto& hand: runhandles) - hand.wait2(); + //enqueue 9 commands + for(int i = 0; i < run_buffer; i++) { + run_handles[i%run_buffer].start(); + } + //wait for each command to finish and add them to the queue + for(int i = 0; i < (itr_count_throughput-run_buffer); i++) { + run_handles[i%run_buffer].wait2(); + run_handles[i%run_buffer].start(); + } auto end = std::chrono::high_resolution_clock::now(); elapsedSecs = std::chrono::duration_cast>(end-start).count(); } @@ -123,7 +147,7 @@ TestNPUThroughput::run(std::shared_ptr dev) } // Compute the throughput - const double throughput = (elapsedSecs != 0.0) ? 
itr_count_throughput / elapsedSecs : 0.0; logger(ptree, "Details", boost::str(boost::format("Average throughput: '%.1f' ops") % throughput)); ptree.put("status", test_token_passed);