Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xrt-smi validate test for spatial-sharing overhead (VITIS-13000) #8395

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
17 changes: 17 additions & 0 deletions src/runtime_src/core/tools/common/TestRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,3 +499,20 @@ TestRunner::get_test_header()
ptree.put("explicit", m_explicit);
return ptree;
}

// Blocks the caller until the shared ready-counter equals the expected number
// of threads, then notifies every waiter on the condition variable.
// The counter is polled every 10 microseconds; the mutex is released while
// sleeping and re-acquired before the counter is read again.
// Parameters:
// - thread_num: Number of threads expected to check in
// - mut: Mutex held while thread_ready is inspected
// - cond_var: Condition variable notified (notify_all) once all threads are ready
// - thread_ready: Shared counter compared against thread_num
void TestRunner::wait_for_threads_ready(uint32_t thread_num, std::mutex& mut, std::condition_variable& cond_var, uint32_t& thread_ready) {
  std::unique_lock<std::mutex> guard(mut);
  for (;;) {
    if (thread_ready == thread_num)
      break;

    // Drop the mutex for the duration of the nap, then take it back
    // before looking at the counter again.
    guard.unlock();
    std::this_thread::sleep_for(std::chrono::microseconds(10));
    guard.lock();
  }

  // The notification is issued while the mutex is still held.
  cond_var.notify_all();
  guard.unlock();
}
1 change: 1 addition & 0 deletions src/runtime_src/core/tools/common/TestRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class TestRunner : public JSONConfigurable {
const std::string & getConfigName() const { return get_name(); };
virtual const std::string& getConfigDescription() const { return m_description; };
boost::property_tree::ptree get_test_header();
void wait_for_threads_ready(uint32_t thread_num, std::mutex& mut, std::condition_variable& cond_var, uint32_t& thread_ready);

// Child class helper methods
protected:
Expand Down
100 changes: 100 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestHelper.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.

This comment was marked as resolved.


// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestHelper.h"

// Constructor for BO_set
// BO_set is a collection of all the buffer objects so that the operations on all buffers can be done from a single object
// Parameters:
// - device: Reference to the xrt::device object
// - kernel: Reference to the xrt::kernel object
BO_set::BO_set(xrt::device& device, xrt::kernel& kernel, size_t buffer_size)
: buffer_size(buffer_size)
{
// Initialize buffer objects with appropriate flags and group IDs
bo_instr = xrt::bo(device, buffer_size, XCL_BO_FLAGS_CACHEABLE, kernel.group_id(5));
bo_ifm = xrt::bo(device, buffer_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(1));
bo_param = xrt::bo(device, buffer_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
bo_ofm = xrt::bo(device, buffer_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
bo_inter = xrt::bo(device, buffer_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
bo_mc = xrt::bo(device, buffer_size, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(7));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use member initializer list to construct the bo objects

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good idea!
I have updated the latest patch with this change.
Thanks


// no-op instruction buffer
std::memset(bo_instr.map<char*>(), (uint8_t)0, buffer_size);
}

// Method to synchronize buffer objects to the device
void BO_set::sync_bos_to_device() {
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_ifm.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_param.sync(XCL_BO_SYNC_BO_TO_DEVICE);
bo_mc.sync(XCL_BO_SYNC_BO_TO_DEVICE);
}

// Binds this buffer set to a run object.
// Argument layout: 0 = opcode (1), 1 = ifm, 2 = param, 3 = ofm, 4 = inter,
// 5 = instr, 6 = buffer size expressed in int-sized words, 7 = mc.
// Parameters:
// - run: Reference to the xrt::run object whose arguments are set
void BO_set::set_kernel_args(xrt::run& run) {
  uint64_t op = 1;
  size_t word_count = buffer_size/sizeof(int);

  run.set_arg(0, op);
  run.set_arg(1, bo_ifm);
  run.set_arg(2, bo_param);
  run.set_arg(3, bo_ofm);
  run.set_arg(4, bo_inter);
  run.set_arg(5, bo_instr);
  run.set_arg(6, word_count);
  run.set_arg(7, bo_mc);
}


// Method to run the test case
// Parameters:
// - mut: Mutex to lock the critical section
// - cond_var: Condition variable to wait on
// - thread_ready: Counter to track the number of ready threads
void
TestCase::run(std::mutex& mut, std::condition_variable& cond_var, uint32_t& thread_ready)
{
std::vector<xrt::kernel> kernels;
std::vector<BO_set> bo_set_list;
std::vector<xrt::run> run_list;

// Initialize kernels, buffer objects, and runs
for (uint32_t j = 0; j < queue_len; j++) {
auto kernel = xrt::kernel(hw_ctx, kernel_name);
auto bos = BO_set(device, kernel, buffer_size);
bos.sync_bos_to_device();
auto run = xrt::run(kernel);
bos.set_kernel_args(run);
run.start();
run.wait2();

kernels.push_back(kernel);
bo_set_list.push_back(bos);
run_list.push_back(run);
}

// Signal that the current thread is ready to run
thread_ready_to_run(mut, cond_var, thread_ready);

for (int i = 0; i < itr_count; i++) {
// Start all runs in the queue so that they run in parallel
for (uint32_t cnt = 0; cnt < queue_len; cnt++) {
run_list[cnt].start();
}
// Wait for all runs in the queue to complete
for (uint32_t cnt = 0; cnt < queue_len; cnt++) {
run_list[cnt].wait2();
}
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you should move the for loop at line 80 into a thread function that you kick off as part of the thread constructor in TestSpatialSharingOvd::run(). The timer can be started just before the threads are constructed.

You shouldn't need thread_ready_to_run();


// Checks the calling thread in and parks it on the condition variable.
// Under the mutex, bumps the shared ready-counter by one and then waits on
// cond_var until it is notified.
// NOTE(review): the wait has no predicate, so a spurious wakeup would let the
// thread proceed before it is notified — confirm whether that is acceptable.
void TestCase::thread_ready_to_run(std::mutex& mut, std::condition_variable& cond_var, uint32_t& thread_ready) {
  std::unique_lock<std::mutex> guard(mut);
  thread_ready += 1;
  cond_var.wait(guard);
  // guard releases the mutex when it goes out of scope
}
55 changes: 55 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestHelper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.


// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "tools/common/TestRunner.h"

// Class representing a set of buffer objects (BOs).
// Groups the buffers used for one kernel run so that they can be created,
// synchronized to the device and bound as kernel arguments through one object.
// Instances are copied by value, so every data member must hold a determinate
// value after construction.
class BO_set {
  xrt::bo bo_instr;            // Buffer object for instructions
  xrt::bo bo_ifm;              // Buffer object for input feature map
  xrt::bo bo_param;            // Buffer object for parameters
  xrt::bo bo_ofm;              // Buffer object for output feature map
  xrt::bo bo_inter;            // Buffer object for intermediate data
  xrt::bo bo_mc;               // Buffer object described as "memory controller" — TODO confirm meaning
  uint32_t instr_size = 0;     // Size of the instruction buffer; zero-initialized because the
                               // constructor does not set it and copying an indeterminate value is undefined
  size_t buffer_size;          // Size in bytes used for every buffer in the set

public:
  // Constructor to initialize buffer objects
  // Parameters:
  // - device: Device the buffers are allocated on
  // - kernel: Kernel whose argument group IDs select the memory banks
  // - buffer_size: Size in bytes of each buffer
  BO_set(xrt::device& device, xrt::kernel& kernel, size_t buffer_size);

  // Method to set kernel arguments
  // Parameters:
  // - run: Run object that receives the opcode, buffers and size
  void set_kernel_args(xrt::run& run);

  // Method to synchronize buffer objects to the device
  void sync_bos_to_device();
};


// Class representing a test case, which is created for a single run on a single thread
class TestCase {
xrt::device device; // Device object
xrt::xclbin xclbin; // Xclbin object
std::string kernel_name; // Name of the kernel
xrt::hw_context hw_ctx; // Hardware context
uint32_t queue_len = 4; // Queue length
size_t buffer_size; // Size of the buffer
int itr_count; // Number of iterations

// Method to signal that a thread is ready to run
void thread_ready_to_run(std::mutex&, std::condition_variable&, uint32_t&);

public:
// Constructor to initialize the test case with xclbin and kernel name with hardware context creation
TestCase(xrt::xclbin& xclbin, std::string& kernel, size_t buffer_size = 1024)
: device(xrt::device(0)), xclbin(xclbin), kernel_name(kernel), buffer_size(buffer_size), itr_count(1000)
aktondak marked this conversation as resolved.
Show resolved Hide resolved
{
device.register_xclbin(xclbin);
hw_ctx = xrt::hw_context(device, xclbin.get_uuid());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

member initializer list please.

}

void run(std::mutex&, std::condition_variable&, uint32_t&);
};

150 changes: 150 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestSpatialSharingOvd.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

// ------ I N C L U D E F I L E S -------------------------------------------
// Local - Include Files
#include "TestSpatialSharingOvd.h"
#include "TestHelper.h"
#include "tools/common/XBUtilities.h"
#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_hw_context.h"
#include "xrt/xrt_kernel.h"
#include <thread>

// Short alias for the XBUtilities namespace (used below for XBU::getVerbose())
namespace XBU = XBUtilities;

// Opcode constant.
// NOTE(review): not referenced anywhere in the visible part of this file — confirm whether it is still needed.
static constexpr size_t host_app = 1; //opcode

// Method to run the test
// Parameters:
// - dev: Shared pointer to the device
// Returns:
// - Property tree containing the test results
boost::property_tree::ptree TestSpatialSharingOvd::run(std::shared_ptr<xrt_core::device> dev) {
// Clear any existing "xclbin" entry in the property tree
ptree.erase("xclbin");

// Query the xclbin name from the device
const auto xclbin_name = xrt_core::device_query<xrt_core::query::xclbin_name>(dev, xrt_core::query::xclbin_name::type::validate);

// Find the platform file path for the xclbin
auto xclbin_path = findPlatformFile(xclbin_name, ptree);

// If the xclbin file does not exist, return the property tree
if (!std::filesystem::exists(xclbin_path))
return ptree;

// Log the xclbin path
logger(ptree, "Xclbin", xclbin_path);

// Create an xclbin object
xrt::xclbin xclbin;
try {
// Load the xclbin file
xclbin = xrt::xclbin(xclbin_path);
}
catch (const std::runtime_error& ex) {
// Log any runtime error and set the status to failed
logger(ptree, "Error", ex.what());
ptree.put("status", test_token_failed);
return ptree;
}

// Determine The DPU Kernel Name
auto xkernels = xclbin.get_kernels();

// Find the first kernel whose name starts with "DPU"
auto itr = std::find_if(xkernels.begin(), xkernels.end(), [](xrt::xclbin::kernel& k) {
auto name = k.get_name();
return name.rfind("DPU",0) == 0; // Starts with "DPU"
});

xrt::xclbin::kernel xkernel;
if (itr != xkernels.end())
xkernel = *itr;
else {
// Log an error if no kernel with "DPU" is found and set the status to failed
logger(ptree, "Error", "No kernel with `DPU` found in the xclbin");
ptree.put("status", test_token_failed);
return ptree;
}

// Get the name of the found kernel
auto kernelName = xkernel.get_name();

// If verbose mode is enabled, log the kernel name
if(XBU::getVerbose())
logger(ptree, "Details", boost::str(boost::format("Kernel name is '%s'") % kernelName));

// Create a working device from the provided device
auto working_dev = xrt::device(dev);
working_dev.register_xclbin(xclbin);

std::mutex mut;
std::condition_variable cond_var;
uint32_t thread_ready = 0;

/* Run 1 */
std::vector<std::thread> threads;
std::vector<TestCase> testcases;

// Create two test cases and add them to the vector
testcases.emplace_back(xclbin, kernelName);
testcases.emplace_back(xclbin, kernelName);

// Lambda function to run a test case. This will be sent to individual thread to be run.
auto runTestcase = [&](TestCase& test) {
try {
test.run(mut, cond_var, thread_ready);
} catch (const std::exception& ex) {
logger(ptree, "Error", ex.what());
ptree.put("status", test_token_failed);
return;
}
};

// Create two threads to run the test cases
threads.emplace_back(runTestcase, std::ref(testcases[0]));
threads.emplace_back(runTestcase, std::ref(testcases[1]));

// Wait for both threads to be ready to begin clocking
wait_for_threads_ready((uint32_t)threads.size(), mut, cond_var, thread_ready);
aktondak marked this conversation as resolved.
Show resolved Hide resolved

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not needed.

// Measure the latency for running the test cases in parallel
auto start = std::chrono::high_resolution_clock::now();
for (uint32_t i = 0; i < threads.size(); i++) {
threads[i].join();
}
auto end = std::chrono::high_resolution_clock::now();
float latencyShared = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
/* End of Run 1 */

thread_ready = 0;

/* Run 2 */
// Create a single test case and run it in a single thread
TestCase t(xclbin, kernelName);
std::thread thr(runTestcase, std::ref(t));

// Wait for the thread to be ready
wait_for_threads_ready(1, mut, cond_var, thread_ready);

// Measure the latency for running the test case in a single thread
start = std::chrono::high_resolution_clock::now();
thr.join();
end = std::chrono::high_resolution_clock::now();
float latencySingle = std::chrono::duration_cast<std::chrono::duration<float>>(end-start).count();
/* End of Run 2 */

// Log the latencies and the overhead
if(XBU::getVerbose()){
logger(ptree, "Details", boost::str(boost::format("LatencySingle: '%.1f' ms") % (latencySingle * 1000)));
logger(ptree, "Details", boost::str(boost::format("LatencyShared: '%.1f' ms") % (latencyShared * 1000)));
aktondak marked this conversation as resolved.
Show resolved Hide resolved
}
logger(ptree, "Details", boost::str(boost::format("Overhead: '%.1f' ms") % ((latencyShared - latencySingle) * 1000)));

// Set the test status to passed
ptree.put("status", test_token_passed);
return ptree;
}
38 changes: 38 additions & 0 deletions src/runtime_src/core/tools/common/tests/TestSpatialSharingOvd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.

#ifndef _TESTSPATIALSHARINGOVD_
#define _TESTSPATIALSHARINGOVD_

#include "tools/common/TestRunner.h"

/**
* @brief Test control flow:
*
* Two threads are spawned to run two instances of testcases concurrently. There's one hardware context created on each thread so
* the 2 threads are doing spatial sharing. The threads are added to the threads vector and started using the runTestcase() lambda.
* The program then waits for both threads to signal they are ready using wait_for_threads_ready().
* Once ready, the latency for running the test cases in parallel is measured by recording the start and
* end times around the join calls for each thread. After the first run, the thread_ready counter is reset,
* and a second run is performed with a single TestCase instance executed in a single thread without spatial sharing.
* The latency for this single-threaded run is similarly measured.
* Finally, the overhead (shared latency minus single latency) is logged, and in verbose mode the two latencies are logged as well,
* to assess the cost of running two test cases in parallel versus running a single test case alone.
*
* @see runTestcase()
* @see wait_for_threads_ready()
* @see std::thread::join()
*/

// Test runner for the "spatial-sharing-overhead" validate test.
class TestSpatialSharingOvd : public TestRunner {
public:
  // Registers the test name and description with the TestRunner base and
  // seeds the result tree from get_test_header().
  TestSpatialSharingOvd()
    : TestRunner("spatial-sharing-overhead", "Run Spatial Sharing Overhead Test")
    , ptree(get_test_header())
  {}

  // Runs the test on the given device and returns the result tree.
  boost::property_tree::ptree run(std::shared_ptr<xrt_core::device> dev);

  // Result tree populated by run(); the only data member of this class.
  boost::property_tree::ptree ptree;
};

#endif
4 changes: 3 additions & 1 deletion src/runtime_src/core/tools/xbutil2/SubCmdValidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include "tools/common/tests/TestCmdChainLatency.h"
#include "tools/common/tests/TestCmdChainThroughput.h"
#include "tools/common/tests/TestAIEReconfigOverhead.h"
#include "tools/common/tests/TestSpatialSharingOvd.h"
namespace XBU = XBUtilities;

// 3rd Party Library - Include Files
Expand Down Expand Up @@ -114,7 +115,8 @@ std::vector<std::shared_ptr<TestRunner>> testSuite = {
std::make_shared<TestNPULatency>(),
std::make_shared<TestCmdChainLatency>(),
std::make_shared<TestCmdChainThroughput>(),
std::make_shared<TestAIEReconfigOverhead>()
std::make_shared<TestAIEReconfigOverhead>(),
std::make_shared<TestSpatialSharingOvd>()
};

/*
Expand Down
Loading
Loading