Skip to content

Commit

Permalink
Manage Tracy colors in a centralized location
Browse files Browse the repository at this point in the history
  • Loading branch information
psalz committed Dec 18, 2024
1 parent 52910f0 commit 6a2b416
Show file tree
Hide file tree
Showing 14 changed files with 137 additions and 60 deletions.
4 changes: 2 additions & 2 deletions include/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class buffer {
/// It notifies the runtime of buffer creation and destruction and also persists changes of the buffer debug name.
struct tracker {
tracker(const celerity::range<Dims>& range, const void* const host_init_ptr) : range(range) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::buffer", DarkSlateBlue);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::buffer", buffer_ctor);

if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr); }
auto user_aid = detail::null_allocation_id;
Expand All @@ -116,7 +116,7 @@ class buffer {
tracker& operator=(tracker&&) = delete;

~tracker() {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::~buffer", DarkCyan);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::~buffer", buffer_dtor);
detail::runtime::get_instance().destroy_buffer(id);
// The user must guarantee liveness of the user pointer only until the buffer instance goes out of scope
// TODO This is more synchronization than necessary - consider issuing a fence-like task that does not block concurrent tasks.
Expand Down
8 changes: 4 additions & 4 deletions include/distr_queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
template <typename CGF>
void submit(CGF cgf) { // NOLINT(readability-convert-member-functions-to-static)
// (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.)
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::submit", Orange3);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::submit", distr_queue_submit);
auto cg = detail::invoke_command_group_function(std::move(cgf));
[[maybe_unused]] const auto tid = detail::runtime::get_instance().submit(std::move(cg));
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} submit", tid);
Expand All @@ -83,7 +83,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
* @warning { This is very slow, as it drains all queues and synchronizes across the entire cluster. }
*/
void slow_full_sync() { // NOLINT(readability-convert-member-functions-to-static)
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::slow_full_sync", Red2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::slow_full_sync", distr_queue_slow_full_sync);
[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::barrier);
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} slow_full_sync", tid);
}
Expand Down Expand Up @@ -126,7 +126,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
/// It notifies the runtime of queue creation and destruction, which might trigger runtime initialization if it is the first such object.
struct tracker {
tracker(const detail::devices_or_selector& devices_or_selector) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::distr_queue", DarkSlateBlue);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::distr_queue", distr_queue_ctor);

if(!detail::runtime::has_instance()) {
detail::runtime::init(nullptr, nullptr, devices_or_selector);
Expand All @@ -144,7 +144,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
tracker& operator=(tracker&&) = delete;

~tracker() {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::~distr_queue", DarkCyan);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::~distr_queue", distr_queue_dtor);

detail::runtime::get_instance().destroy_queue();

Expand Down
4 changes: 2 additions & 2 deletions include/fence.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ class buffer_fence_promise final : public detail::task_promise {
template <typename T>
std::future<T> fence(const experimental::host_object<T>& obj) {
static_assert(std::is_object_v<T>, "host_object<T&> and host_object<void> are not allowed as parameters to fence()");
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", Green2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", queue_fence);

const host_object_effect effect{detail::get_host_object_id(obj), experimental::side_effect_order::sequential};
auto promise = std::make_unique<detail::host_object_fence_promise<T>>(detail::get_host_object_instance(obj));
Expand All @@ -123,7 +123,7 @@ std::future<T> fence(const experimental::host_object<T>& obj) {

template <typename DataT, int Dims>
std::future<buffer_snapshot<DataT, Dims>> fence(const buffer<DataT, Dims>& buf, const subrange<Dims>& sr) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", Green2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", queue_fence);

detail::buffer_access access{detail::get_buffer_id(buf), access_mode::read,
std::make_unique<detail::range_mapper<Dims, celerity::access::fixed<Dims>>>(celerity::access::fixed<Dims>(sr), buf.get_range())};
Expand Down
4 changes: 2 additions & 2 deletions include/host_object.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ struct host_object_tracker {
bool references_user_object;

explicit host_object_tracker(std::unique_ptr<host_object_instance> instance) : references_user_object(instance == nullptr) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::host_object", DarkSlateBlue);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::host_object", host_object_ctor);
if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr); }
id = detail::runtime::get_instance().create_host_object(std::move(instance));
}
Expand All @@ -49,7 +49,7 @@ struct host_object_tracker {
host_object_tracker& operator=(const host_object_tracker&) = delete;

~host_object_tracker() {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("~host_object::host_object", DarkCyan);
// Zone label follows the `type::~type` pattern used by the other destructor zones.
CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::~host_object", host_object_dtor);
detail::runtime::get_instance().destroy_host_object(id);
// The user must guarantee liveness of the referenced object only until the host_object instance goes out of scope
if(references_user_object) { detail::runtime::get_instance().sync(detail::epoch_action::none); }
Expand Down
10 changes: 5 additions & 5 deletions include/queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class queue {
template <typename CGF>
void submit(CGF&& cgf) { // NOLINT(readability-convert-member-functions-to-static)
// (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.)
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", Orange3);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", queue_submit);
auto cg = detail::invoke_command_group_function(std::forward<CGF>(cgf));
[[maybe_unused]] const auto tid = detail::runtime::get_instance().submit(std::move(cg));
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} submit", tid);
Expand All @@ -49,7 +49,7 @@ class queue {
/// Note that this overload of `wait` does not issue a global barrier, so when using this for simple user-side benchmarking, cluster nodes might disagree on
/// start time measurements. Use `wait(experimental::barrier)` instead for benchmarking purposes.
void wait() { // NOLINT(readability-convert-member-functions-to-static)
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", Red2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", queue_wait);
[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::none);
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} wait", tid);
}
Expand All @@ -58,7 +58,7 @@ class queue {
///
/// This has an even higher latency than `wait()`, but may be useful for user-side performance measurements.
void wait(detail::barrier_tag /* barrier */) { // NOLINT(readability-convert-member-functions-to-static)
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", Red2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", queue_wait);
[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::barrier);
CELERITY_DETAIL_TRACY_ZONE_NAME("T{} wait (barrier)", tid);
}
Expand Down Expand Up @@ -95,7 +95,7 @@ class queue {
/// It notifies the runtime of queue creation and destruction, which might trigger runtime initialization if it is the first such object.
struct tracker {
tracker() {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::queue", DarkSlateBlue);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::queue", queue_ctor);
if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr, detail::auto_select_devices{}); }
detail::runtime::get_instance().create_queue();
}
Expand All @@ -106,7 +106,7 @@ class queue {
tracker& operator=(tracker&&) = delete;

~tracker() {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::~queue", DarkCyan);
// Destructor zone: use the dedicated dtor color entry, not the ctor one.
CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::~queue", queue_dtor);

detail::runtime::get_instance().destroy_queue();

Expand Down
81 changes: 79 additions & 2 deletions include/tracy.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,86 @@ inline const char* leak_name(const std::string& name) {
inline void set_thread_name_and_order(const std::string& name, const int32_t index) {
const int32_t order = tracy_detail::lane_order::thread + index;
assert(order <= static_cast<int32_t>(tracy_detail::lane_order::thread_max));
::tracy::SetThreadNameWithHint(leak_name(name), order);
tracy::SetThreadNameWithHint(leak_name(name), order);
}

} // namespace celerity::detail::tracy_detail

namespace celerity::detail {

/// Central table of the Tracy zone colors used throughout the code base.
///
/// Each enumerator is a symbolic name for one instrumented zone (or a group of zones) and carries the numeric value of a `tracy::Color` constant.
/// Call sites pass the bare enumerator name as the COLOR_NAME argument of the zone macros, e.g.
/// `CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", queue_submit)`.
///
/// Enumerators that are initialized from another enumerator share that enumerator's color value, so changing the referenced entry changes both.
enum class trace_color : std::underlying_type_t<tracy::Color::ColorType> {
// General-purpose colors
generic_red = tracy::Color::Red,
generic_green = tracy::Color::Green,
generic_blue = tracy::Color::Blue,
generic_yellow = tracy::Color::Yellow,

// buffer
buffer_ctor = tracy::Color::DarkSlateBlue,
buffer_dtor = tracy::Color::DarkCyan,

// cuda: the dimension-specific memcpy entries alias `cuda_memcpy`
cuda_memcpy = tracy::Color::ForestGreen,
cuda_memcpy_1d = cuda_memcpy,
cuda_memcpy_2d = cuda_memcpy,
cuda_memcpy_3d = cuda_memcpy,
cuda_record_event = tracy::Color::ForestGreen,

// distr_queue
distr_queue_ctor = tracy::Color::DarkSlateBlue,
distr_queue_dtor = tracy::Color::DarkCyan,
distr_queue_slow_full_sync = tracy::Color::Red2,
distr_queue_submit = tracy::Color::Orange3,

// executor
executor_fetch = tracy::Color::Gray,
executor_issue = tracy::Color::Blue,
executor_issue_copy = tracy::Color::Green4,
executor_issue_device_kernel = tracy::Color::Yellow2,
executor_make_accessor_info = tracy::Color::Magenta3,
executor_oob_check = tracy::Color::Red,
executor_oob_init = executor_oob_check,
executor_retire = tracy::Color::Brown,
executor_starve = tracy::Color::DarkSlateGray,

// host_object
host_object_ctor = tracy::Color::DarkSlateBlue,
host_object_dtor = tracy::Color::DarkCyan,

// iggen
iggen_allocate = tracy::Color::Teal,
iggen_anticipate = iggen_allocate,
iggen_coherence = tracy::Color::Red2,
iggen_launch_kernel = tracy::Color::Blue2,
iggen_perform_buffer_access = tracy::Color::Red3,
iggen_satisfy_buffer_requirements = tracy::Color::ForestGreen,
iggen_split_task = tracy::Color::Maroon,

// mpi
mpi_finalize = tracy::Color::LightSkyBlue,
mpi_init = tracy::Color::LightSkyBlue,

// out_of_order_engine
out_of_order_engine_assign = tracy::Color::Blue3,
out_of_order_engine_complete = tracy::Color::Blue3,
out_of_order_engine_submit = tracy::Color::Blue3,

// queue: reuses the colors of the corresponding distr_queue entries (queue_wait maps to distr_queue_slow_full_sync)
queue_ctor = distr_queue_ctor,
queue_dtor = distr_queue_dtor,
queue_fence = tracy::Color::Green2,
queue_submit = distr_queue_submit,
queue_wait = distr_queue_slow_full_sync,

// runtime
runtime_select_devices = tracy::Color::PaleVioletRed,
runtime_shutdown = tracy::Color::DimGray,
runtime_startup = tracy::Color::DarkGray,

// scheduler: the *_destroyed entries alias their *_created counterparts
scheduler_buffer_created = tracy::Color::DarkGreen,
scheduler_buffer_destroyed = scheduler_buffer_created,
scheduler_buffer_name_changed = tracy::Color::DarkGreen,
scheduler_build_task = tracy::Color::WebMaroon,
scheduler_compile_command = tracy::Color::MidnightBlue,
scheduler_host_object_created = tracy::Color::DarkGreen,
scheduler_host_object_destroyed = scheduler_host_object_created,
scheduler_prune = tracy::Color::Gray,

// sycl
sycl_init = tracy::Color::Orange2,
sycl_submit = tracy::Color::Orange2,
};

} // namespace celerity::detail

#define CELERITY_DETAIL_IF_TRACY_SUPPORTED(...) __VA_ARGS__

#else
Expand All @@ -100,7 +175,9 @@ inline void set_thread_name_and_order(const std::string& name, const int32_t ind
#define CELERITY_DETAIL_IF_TRACY_ENABLED_FULL(...) CELERITY_DETAIL_IF_TRACY_SUPPORTED(if(::celerity::detail::tracy_detail::is_enabled_full()) { __VA_ARGS__; })

// Opens a Tracy zone named TAG via ZoneNamedNC, wrapped in CELERITY_DETAIL_IF_TRACY_SUPPORTED.
// COLOR_NAME is token-pasted after a scope qualifier, so it must be a bare identifier (no namespace prefix, no expression).
// The result of tracy_detail::is_enabled() is forwarded as ZoneNamedNC's last argument.
#define CELERITY_DETAIL_TRACY_ZONE_SCOPED(TAG, COLOR_NAME) \
CELERITY_DETAIL_IF_TRACY_SUPPORTED(ZoneNamedNC(___tracy_scoped_zone, TAG, ::tracy::Color::COLOR_NAME, ::celerity::detail::tracy_detail::is_enabled()))
CELERITY_DETAIL_IF_TRACY_SUPPORTED(ZoneNamedNC(___tracy_scoped_zone, TAG, \
static_cast<std::underlying_type_t<::celerity::detail::trace_color>>(::celerity::detail::trace_color::COLOR_NAME), \
::celerity::detail::tracy_detail::is_enabled()))

#define CELERITY_DETAIL_TRACY_ZONE_NAME(...) \
CELERITY_DETAIL_IF_TRACY_ENABLED_FULL(::celerity::detail::tracy_detail::apply_string([&](const auto& n) { ZoneName(n.data(), n.size()); }, __VA_ARGS__))
Expand Down
4 changes: 2 additions & 2 deletions src/backend/sycl_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ sycl_backend::~sycl_backend() {
const system_info& sycl_backend::get_system_info() const { return m_impl->system; }

void sycl_backend::init() {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", Orange2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", sycl_init);

// Instantiate the first in-order queue on each device. At least for CUDA systems this will perform device initialization, which can take > 100 ms / device.
for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
Expand Down Expand Up @@ -276,7 +276,7 @@ async_event sycl_backend::enqueue_device_kernel(const device_id device, const si
std::vector<closure_hydrator::accessor_info> accessor_infos, const box<3>& execution_range, const std::vector<void*>& reduction_ptrs) //
{
return enqueue_device_work(device, lane, [=, this, acc_infos = std::move(accessor_infos)](sycl::queue& queue) mutable {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
auto event = queue.submit([&](sycl::handler& sycl_cgh) {
auto& hydrator = closure_hydrator::get_instance();
hydrator.arm(target::device, std::move(acc_infos));
Expand Down
10 changes: 5 additions & 5 deletions src/backend/sycl_cuda_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,17 @@ void nd_copy_device_async(const cudaStream_t stream, const void* const source_ba
if(layout.contiguous_size == 0) return;

if(layout.num_complex_strides == 0) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_1d", ForestGreen, "cudaMemcpyAsync");
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_1d", cuda_memcpy_1d, "cudaMemcpyAsync");
CELERITY_CUDA_CHECK(cudaMemcpyAsync, static_cast<std::byte*>(dest_base) + layout.offset_in_dest,
static_cast<const std::byte*>(source_base) + layout.offset_in_source, layout.contiguous_size, cudaMemcpyDefault, stream);
} else if(layout.num_complex_strides == 1) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_2d", ForestGreen, "cudaMemcpy2DAsync");
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_2d", cuda_memcpy_2d, "cudaMemcpy2DAsync");
CELERITY_CUDA_CHECK(cudaMemcpy2DAsync, static_cast<std::byte*>(dest_base) + layout.offset_in_dest, layout.strides[0].dest_stride,
static_cast<const std::byte*>(source_base) + layout.offset_in_source, layout.strides[0].source_stride, layout.contiguous_size,
layout.strides[0].count, cudaMemcpyDefault, stream);
} else {
assert(layout.num_complex_strides == 2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_3d", ForestGreen, "cudaMemcpy3DAsync");
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_3d", cuda_memcpy_3d, "cudaMemcpy3DAsync");
// Arriving in the 3D case means no dimensionality reduction was possible, and cudaMemcpy3D is more closely aligned to the parameters to
// nd_copy_device_async than to nd_copy_layout, so we don't compute cudaMemcpy3DParms from `layout`.
cudaMemcpy3DParms parms = {};
Expand Down Expand Up @@ -81,7 +81,7 @@ void nd_copy_device_async(cudaStream_t stream, const void* const source_base, vo
nd_copy_device_async(stream, source, dest, source_box, dest_box, copy_box, elem_size);
},
[stream](const void* const source, void* const dest, size_t size_bytes) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy", ForestGreen, "cudaMemcpyAsync");
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy", cuda_memcpy, "cudaMemcpyAsync");
CELERITY_CUDA_CHECK(cudaMemcpyAsync, dest, source, size_bytes, cudaMemcpyDefault, stream);
});
}
Expand All @@ -101,7 +101,7 @@ struct cuda_native_event_deleter {
using unique_cuda_native_event = std::unique_ptr<std::remove_pointer_t<cudaEvent_t>, cuda_native_event_deleter>;

unique_cuda_native_event record_native_event(const cudaStream_t stream, bool enable_profiling) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::record_event", ForestGreen, "cudaEventRecord")
CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::record_event", cuda_record_event, "cudaEventRecord")
cudaEvent_t event;
CELERITY_CUDA_CHECK(cudaEventCreateWithFlags, &event, enable_profiling ? cudaEventDefault : cudaEventDisableTiming);
CELERITY_CUDA_CHECK(cudaEventRecord, event, stream);
Expand Down
4 changes: 2 additions & 2 deletions src/backend/sycl_generic_backend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ void nd_copy_device_chunked(sycl::queue& queue, const void* const source_base, v
const auto layout = layout_nd_copy(source_box.get_range(), dest_box.get_range(), copy_box.get_offset() - source_box.get_offset(),
copy_box.get_offset() - dest_box.get_offset(), copy_box.get_range(), elem_size);
for_each_contiguous_chunk(layout, [&](const size_t chunk_offset_in_source, const size_t chunk_offset_in_dest, const size_t chunk_size) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
// first, last: We remember the first and last submission event to report completion time spanning the entire region copy
last = queue.memcpy(
static_cast<std::byte*>(dest_base) + chunk_offset_in_dest, static_cast<const std::byte*>(source_base) + chunk_offset_in_source, chunk_size);
Expand All @@ -47,7 +47,7 @@ async_event nd_copy_device_generic(sycl::queue& queue, const void* const source_
[&queue, elem_size, enable_profiling, &first, &last](const void* const source, void* const dest, const box<3>& source_box, const box<3>& dest_box,
const box<3>& copy_box) { nd_copy_device_chunked(queue, source, dest, source_box, dest_box, copy_box, elem_size, enable_profiling, first, last); },
[&queue, enable_profiling, &first, &last](const void* const source, void* const dest, size_t size_bytes) {
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
last = queue.memcpy(dest, source, size_bytes);
if(enable_profiling) { first = last; }
});
Expand Down
Loading

0 comments on commit 6a2b416

Please sign in to comment.