diff --git a/include/buffer.h b/include/buffer.h
index 2c5384b9..f14ec91b 100644
--- a/include/buffer.h
+++ b/include/buffer.h
@@ -98,7 +98,7 @@ class buffer {
 	/// It notifies the runtime of buffer creation and destruction and also persists changes of the buffer debug name.
 	struct tracker {
 		tracker(const celerity::range<Dims>& range, const void* const host_init_ptr) : range(range) {
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::buffer", DarkSlateBlue);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::buffer", buffer_ctor);
 			if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr); }
 			auto user_aid = detail::null_allocation_id;
@@ -116,7 +116,7 @@ class buffer {
 		tracker& operator=(tracker&&) = delete;
 
 		~tracker() {
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::~buffer", DarkCyan);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("buffer::~buffer", buffer_dtor);
			detail::runtime::get_instance().destroy_buffer(id);
 			// The user must guarantee liveness of the user pointer only until the buffer instance goes out of scope
 			// TODO This is more synchronization than necessary - consider issuing a fence-like task that does not block concurrent tasks.
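The `tracker` struct above is the RAII idiom every user-facing Celerity object in this diff relies on: constructing the first tracker lazily initializes the runtime, and its destructor pairs the creation notification with a destruction notification. A minimal self-contained sketch of that idiom, with a stand-in `runtime` whose names and API are illustrative only, not Celerity's actual `detail::runtime`:

```cpp
#include <cassert>
#include <memory>

// Stand-in for detail::runtime -- names and member functions are hypothetical.
class runtime {
  public:
	static bool has_instance() { return s_instance != nullptr; }
	static void init() { s_instance.reset(new runtime()); }
	static runtime& get_instance() {
		assert(s_instance != nullptr);
		return *s_instance;
	}

	int create_buffer() { return m_next_id++; }
	void destroy_buffer(int /* id */) { /* unregister from graph generation */ }

  private:
	static std::unique_ptr<runtime> s_instance;
	int m_next_id = 0;
};
std::unique_ptr<runtime> runtime::s_instance;

// RAII tracker: lazily initializes the runtime and pairs creation with destruction.
struct tracker {
	int id;
	tracker() {
		if(!runtime::has_instance()) { runtime::init(); }
		id = runtime::get_instance().create_buffer();
	}
	tracker(const tracker&) = delete;
	tracker& operator=(const tracker&) = delete;
	~tracker() { runtime::get_instance().destroy_buffer(id); }
};

int main() {
	const auto shared = std::make_shared<tracker>(); // copies of a buffer would share one tracker
}
```

Copies of a `buffer` share the tracker through a `shared_ptr`, so the destruction notification fires exactly once, when the last copy goes out of scope.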
diff --git a/include/distr_queue.h b/include/distr_queue.h
index 1eaa644a..35b4a7a3 100644
--- a/include/distr_queue.h
+++ b/include/distr_queue.h
@@ -69,7 +69,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
 	template <typename CGF>
 	void submit(CGF cgf) { // NOLINT(readability-convert-member-functions-to-static)
 		// (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.)
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::submit", Orange3);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::submit", distr_queue_submit);
 		auto cg = detail::invoke_command_group_function(std::move(cgf));
 		[[maybe_unused]] const auto tid = detail::runtime::get_instance().submit(std::move(cg));
 		CELERITY_DETAIL_TRACY_ZONE_NAME("T{} submit", tid);
@@ -83,7 +83,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
 	 * @warning { This is very slow, as it drains all queues and synchronizes across the entire cluster. }
 	 */
 	void slow_full_sync() { // NOLINT(readability-convert-member-functions-to-static)
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::slow_full_sync", Red2);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::slow_full_sync", distr_queue_slow_full_sync);
 		[[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::barrier);
 		CELERITY_DETAIL_TRACY_ZONE_NAME("T{} slow_full_sync", tid);
 	}
@@ -126,7 +126,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
 	/// It notifies the runtime of queue creation and destruction, which might trigger runtime initialization if it is the first such object.
 	struct tracker {
 		tracker(const detail::devices_or_selector& devices_or_selector) {
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::distr_queue", DarkSlateBlue);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::distr_queue", distr_queue_ctor);
 			if(!detail::runtime::has_instance()) {
 				detail::runtime::init(nullptr, nullptr, devices_or_selector);
@@ -144,7 +144,7 @@ class [[deprecated("Use celerity::queue instead")]] distr_queue {
 		tracker& operator=(tracker&&) = delete;
 
 		~tracker() {
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::~distr_queue", DarkCyan);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("distr_queue::~distr_queue", distr_queue_dtor);
 			detail::runtime::get_instance().destroy_queue();

diff --git a/include/fence.h b/include/fence.h
index a5163a13..9c745a32 100644
--- a/include/fence.h
+++ b/include/fence.h
@@ -110,7 +110,7 @@ class buffer_fence_promise final : public detail::task_promise {
 template <typename T>
 std::future<void> fence(const experimental::host_object<T>& obj) {
 	static_assert(std::is_object_v<T>, "host_object<T&> and host_object<void> are not allowed as parameters to fence()");
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", Green2);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", queue_fence);
 	const host_object_effect effect{detail::get_host_object_id(obj), experimental::side_effect_order::sequential};
 	auto promise = std::make_unique<host_object_fence_promise<T>>(detail::get_host_object_instance(obj));
@@ -123,7 +123,7 @@ std::future<void> fence(const experimental::host_object<T>& obj) {
 
 template <typename DataT, int Dims>
 std::future<buffer_snapshot<DataT, Dims>> fence(const buffer<DataT, Dims>& buf, const subrange<Dims>& sr) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", Green2);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::fence", queue_fence);
 	detail::buffer_access access{detail::get_buffer_id(buf), access_mode::read,
 	    std::make_unique<detail::range_mapper<Dims, celerity::access::fixed<Dims>>>(celerity::access::fixed<Dims>(sr), buf.get_range())};

diff --git a/include/host_object.h b/include/host_object.h
index c58eb3bc..22e34b43 100644
--- a/include/host_object.h
+++ b/include/host_object.h
@@ -38,7 +38,7 @@ struct host_object_tracker {
 	bool references_user_object;
 
 	explicit host_object_tracker(std::unique_ptr<host_object_instance> instance) : references_user_object(instance == nullptr) {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::host_object", DarkSlateBlue);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::host_object", host_object_ctor);
 		if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr); }
 		id = detail::runtime::get_instance().create_host_object(std::move(instance));
 	}
@@ -49,7 +49,7 @@ struct host_object_tracker {
 	host_object_tracker& operator=(const host_object_tracker&) = delete;
 
 	~host_object_tracker() {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::~host_object", DarkCyan);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("host_object::~host_object", host_object_dtor);
 		detail::runtime::get_instance().destroy_host_object(id);
 		// The user must guarantee liveness of the referenced object only until the host_object instance goes out of scope
 		if(references_user_object) { detail::runtime::get_instance().sync(detail::epoch_action::none); }
- CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", Orange3); + CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::submit", queue_submit); auto cg = detail::invoke_command_group_function(std::forward(cgf)); [[maybe_unused]] const auto tid = detail::runtime::get_instance().submit(std::move(cg)); CELERITY_DETAIL_TRACY_ZONE_NAME("T{} submit", tid); @@ -49,7 +49,7 @@ class queue { /// Note that this overload of `wait` does not issue a global barrier, so when using this for simple user-side benchmarking, cluster nodes might disagree on /// start time measurements. Use `wait(experimental::barrier)` instead for benchmarking purposes. void wait() { // NOLINT(readability-convert-member-functions-to-static) - CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", Red2); + CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", queue_wait); [[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::none); CELERITY_DETAIL_TRACY_ZONE_NAME("T{} wait", tid); } @@ -58,7 +58,7 @@ class queue { /// /// This has an even higher latency than `wait()`, but may be useful for user-side performance measurements. void wait(detail::barrier_tag /* barrier */) { // NOLINT(readability-convert-member-functions-to-static) - CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", Red2); + CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::wait", queue_wait); [[maybe_unused]] const auto tid = detail::runtime::get_instance().sync(detail::epoch_action::barrier); CELERITY_DETAIL_TRACY_ZONE_NAME("T{} wait (barrier)", tid); } @@ -95,7 +95,7 @@ class queue { /// It notifies the runtime of queue creation and destruction, which might trigger runtime initialization if it is the first such object. struct tracker { tracker() { - CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::queue", DarkSlateBlue); + CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::queue", queue_ctor); if(!detail::runtime::has_instance()) { detail::runtime::init(nullptr, nullptr, detail::auto_select_devices{}); } detail::runtime::get_instance().create_queue(); } @@ -106,7 +106,7 @@ class queue { tracker& operator=(tracker&&) = delete; ~tracker() { - CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::~queue", DarkCyan); + CELERITY_DETAIL_TRACY_ZONE_SCOPED("queue::~queue", queue_ctor); detail::runtime::get_instance().destroy_queue(); diff --git a/include/tracy.h b/include/tracy.h index 2ac14ccf..5c03b617 100644 --- a/include/tracy.h +++ b/include/tracy.h @@ -82,11 +82,86 @@ inline const char* leak_name(const std::string& name) { inline void set_thread_name_and_order(const std::string& name, const int32_t index) { const int32_t order = tracy_detail::lane_order::thread + index; assert(order <= static_cast(tracy_detail::lane_order::thread_max)); - ::tracy::SetThreadNameWithHint(leak_name(name), order); + tracy::SetThreadNameWithHint(leak_name(name), order); } } // namespace celerity::detail::tracy_detail +namespace celerity::detail { + +enum class trace_color : std::underlying_type_t { + generic_red = tracy::Color::Red, + generic_green = tracy::Color::Green, + generic_blue = tracy::Color::Blue, + generic_yellow = tracy::Color::Yellow, + + buffer_ctor = tracy::Color::DarkSlateBlue, + buffer_dtor = tracy::Color::DarkCyan, + + cuda_memcpy = tracy::Color::ForestGreen, + cuda_memcpy_1d = cuda_memcpy, + cuda_memcpy_2d = cuda_memcpy, + cuda_memcpy_3d = cuda_memcpy, + cuda_record_event = tracy::Color::ForestGreen, + + distr_queue_ctor = tracy::Color::DarkSlateBlue, + distr_queue_dtor = tracy::Color::DarkCyan, + distr_queue_slow_full_sync = tracy::Color::Red2, + distr_queue_submit = 
diff --git a/include/tracy.h b/include/tracy.h
index 2ac14ccf..5c03b617 100644
--- a/include/tracy.h
+++ b/include/tracy.h
@@ -82,11 +82,86 @@ inline const char* leak_name(const std::string& name) {
 inline void set_thread_name_and_order(const std::string& name, const int32_t index) {
 	const int32_t order = tracy_detail::lane_order::thread + index;
 	assert(order <= static_cast<int32_t>(tracy_detail::lane_order::thread_max));
-	::tracy::SetThreadNameWithHint(leak_name(name), order);
+	tracy::SetThreadNameWithHint(leak_name(name), order);
 }
 
 } // namespace celerity::detail::tracy_detail
 
+namespace celerity::detail {
+
+enum class trace_color : std::underlying_type_t<tracy::Color::ColorType> {
+	generic_red = tracy::Color::Red,
+	generic_green = tracy::Color::Green,
+	generic_blue = tracy::Color::Blue,
+	generic_yellow = tracy::Color::Yellow,
+
+	buffer_ctor = tracy::Color::DarkSlateBlue,
+	buffer_dtor = tracy::Color::DarkCyan,
+
+	cuda_memcpy = tracy::Color::ForestGreen,
+	cuda_memcpy_1d = cuda_memcpy,
+	cuda_memcpy_2d = cuda_memcpy,
+	cuda_memcpy_3d = cuda_memcpy,
+	cuda_record_event = tracy::Color::ForestGreen,
+
+	distr_queue_ctor = tracy::Color::DarkSlateBlue,
+	distr_queue_dtor = tracy::Color::DarkCyan,
+	distr_queue_slow_full_sync = tracy::Color::Red2,
+	distr_queue_submit = tracy::Color::Orange3,
+
+	executor_fetch = tracy::Color::Gray,
+	executor_issue = tracy::Color::Blue,
+	executor_issue_copy = tracy::Color::Green4,
+	executor_issue_device_kernel = tracy::Color::Yellow2,
+	executor_make_accessor_info = tracy::Color::Magenta3,
+	executor_oob_check = tracy::Color::Red,
+	executor_oob_init = executor_oob_check,
+	executor_retire = tracy::Color::Brown,
+	executor_starve = tracy::Color::DarkSlateGray,
+
+	host_object_ctor = tracy::Color::DarkSlateBlue,
+	host_object_dtor = tracy::Color::DarkCyan,
+
+	iggen_allocate = tracy::Color::Teal,
+	iggen_anticipate = iggen_allocate,
+	iggen_coherence = tracy::Color::Red2,
+	iggen_launch_kernel = tracy::Color::Blue2,
+	iggen_perform_buffer_access = tracy::Color::Red3,
+	iggen_satisfy_buffer_requirements = tracy::Color::ForestGreen,
+	iggen_split_task = tracy::Color::Maroon,
+
+	mpi_finalize = tracy::Color::LightSkyBlue,
+	mpi_init = tracy::Color::LightSkyBlue,
+
+	out_of_order_engine_assign = tracy::Color::Blue3,
+	out_of_order_engine_complete = tracy::Color::Blue3,
+	out_of_order_engine_submit = tracy::Color::Blue3,
+
+	queue_ctor = distr_queue_ctor,
+	queue_dtor = distr_queue_dtor,
+	queue_fence = tracy::Color::Green2,
+	queue_submit = distr_queue_submit,
+	queue_wait = distr_queue_slow_full_sync,
+
+	runtime_select_devices = tracy::Color::PaleVioletRed,
+	runtime_shutdown = tracy::Color::DimGray,
+	runtime_startup = tracy::Color::DarkGray,
+
+	scheduler_buffer_created = tracy::Color::DarkGreen,
+	scheduler_buffer_destroyed = scheduler_buffer_created,
+	scheduler_buffer_name_changed = tracy::Color::DarkGreen,
+	scheduler_build_task = tracy::Color::WebMaroon,
+	scheduler_compile_command = tracy::Color::MidnightBlue,
+	scheduler_host_object_created = tracy::Color::DarkGreen,
+	scheduler_host_object_destroyed = scheduler_host_object_created,
+	scheduler_prune = tracy::Color::Gray,
+
+	sycl_init = tracy::Color::Orange2,
+	sycl_submit = tracy::Color::Orange2,
+};
+
+}
+
 #define CELERITY_DETAIL_IF_TRACY_SUPPORTED(...) __VA_ARGS__
 
 #else
@@ -100,7 +175,9 @@ inline void set_thread_name_and_order(const std::string& name, const int32_t ind
 #define CELERITY_DETAIL_IF_TRACY_ENABLED_FULL(...) CELERITY_DETAIL_IF_TRACY_SUPPORTED(if(::celerity::detail::tracy_detail::is_enabled_full()) { __VA_ARGS__; })
 
 #define CELERITY_DETAIL_TRACY_ZONE_SCOPED(TAG, COLOR_NAME) \
-	CELERITY_DETAIL_IF_TRACY_SUPPORTED(ZoneNamedNC(___tracy_scoped_zone, TAG, ::tracy::Color::COLOR_NAME, ::celerity::detail::tracy_detail::is_enabled()))
+	CELERITY_DETAIL_IF_TRACY_SUPPORTED(ZoneNamedNC(___tracy_scoped_zone, TAG, \
+	    static_cast<std::underlying_type_t<::celerity::detail::trace_color>>(::celerity::detail::trace_color::COLOR_NAME), \
+	    ::celerity::detail::tracy_detail::is_enabled()))
 
 #define CELERITY_DETAIL_TRACY_ZONE_NAME(...) \
 	CELERITY_DETAIL_IF_TRACY_ENABLED_FULL(::celerity::detail::tracy_detail::apply_string([&](const auto& n) { ZoneName(n.data(), n.size()); }, __VA_ARGS__))
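The mechanics of the new enum are worth spelling out: `trace_color` borrows its underlying type from `tracy::Color::ColorType`, so every enumerator can be initialized directly from a Tracy color constant or alias a previously declared enumerator, and the zone macro lowers the scoped value back to the raw integer that `ZoneNamedNC` expects via a `static_cast` to the underlying type. A compilable sketch of the pattern with a mocked-up color namespace (the `color` namespace and its values are stand-ins, not the Tracy header):

```cpp
#include <cstdio>
#include <type_traits>

// Mock of tracy::Color -- the real header defines an enum ColorType of X11 color values.
namespace color {
enum ColorType : unsigned { DarkSlateBlue = 0x483d8b, DarkCyan = 0x008b8b, Orange3 = 0xcd8500 };
}

// One semantic name per zone; aliases keep related zones visually identical in traces.
enum class trace_color : std::underlying_type_t<color::ColorType> {
	distr_queue_ctor = color::DarkSlateBlue,
	distr_queue_dtor = color::DarkCyan,
	distr_queue_submit = color::Orange3,
	queue_ctor = distr_queue_ctor, // queue zones share the distr_queue palette
	queue_dtor = distr_queue_dtor,
	queue_submit = distr_queue_submit,
};

// The zone macro converts the scoped enumerator back to the raw value the profiler expects.
#define ZONE_SCOPED(TAG, COLOR_NAME) \
	std::printf("%s -> #%06x\n", TAG, static_cast<std::underlying_type_t<trace_color>>(trace_color::COLOR_NAME))

int main() {
	ZONE_SCOPED("queue::submit", queue_submit); // same color as "distr_queue::submit"
}
```

Centralizing the palette this way means semantically related zones (all constructors, all destructors, both submit paths) can only drift apart by editing one header instead of a dozen call sites.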
diff --git a/src/backend/sycl_backend.cc b/src/backend/sycl_backend.cc
index 4606128d..1283f658 100644
--- a/src/backend/sycl_backend.cc
+++ b/src/backend/sycl_backend.cc
@@ -213,7 +213,7 @@ sycl_backend::~sycl_backend() {
 const system_info& sycl_backend::get_system_info() const { return m_impl->system; }
 
 void sycl_backend::init() {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", Orange2);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::init", sycl_init);
 	// Instantiate the first in-order queue on each device. At least for CUDA systems this will perform device initialization, which can take > 100 ms / device.
 	for(device_id did = 0; did < m_impl->system.devices.size(); ++did) {
@@ -276,7 +276,7 @@ async_event sycl_backend::enqueue_device_kernel(const device_id device, const si
     std::vector<closure_hydrator::accessor_info> accessor_infos, const box<3>& execution_range, const std::vector<void*>& reduction_ptrs) //
 {
 	return enqueue_device_work(device, lane, [=, this, acc_infos = std::move(accessor_infos)](sycl::queue& queue) mutable {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
 		auto event = queue.submit([&](sycl::handler& sycl_cgh) {
 			auto& hydrator = closure_hydrator::get_instance();
 			hydrator.arm(target::device, std::move(acc_infos));

diff --git a/src/backend/sycl_cuda_backend.cc b/src/backend/sycl_cuda_backend.cc
index b9b6be71..11ed2c96 100644
--- a/src/backend/sycl_cuda_backend.cc
+++ b/src/backend/sycl_cuda_backend.cc
@@ -38,17 +38,17 @@ void nd_copy_device_async(const cudaStream_t stream, const void* const source_ba
 	if(layout.contiguous_size == 0) return;
 
 	if(layout.num_complex_strides == 0) {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_1d", ForestGreen, "cudaMemcpyAsync");
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_1d", cuda_memcpy_1d, "cudaMemcpyAsync");
 		CELERITY_CUDA_CHECK(cudaMemcpyAsync, static_cast<char*>(dest_base) + layout.offset_in_dest,
 		    static_cast<const char*>(source_base) + layout.offset_in_source, layout.contiguous_size, cudaMemcpyDefault, stream);
 	} else if(layout.num_complex_strides == 1) {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_2d", ForestGreen, "cudaMemcpy2DAsync");
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_2d", cuda_memcpy_2d, "cudaMemcpy2DAsync");
 		CELERITY_CUDA_CHECK(cudaMemcpy2DAsync, static_cast<char*>(dest_base) + layout.offset_in_dest, layout.strides[0].dest_stride,
 		    static_cast<const char*>(source_base) + layout.offset_in_source, layout.strides[0].source_stride, layout.contiguous_size,
 		    layout.strides[0].count, cudaMemcpyDefault, stream);
 	} else {
 		assert(layout.num_complex_strides == 2);
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_3d", ForestGreen, "cudaMemcpy3DAsync");
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy_3d", cuda_memcpy_3d, "cudaMemcpy3DAsync");
 		// Arriving in the 3D case means no dimensionality reduction was possible, and cudaMemcpy3D is more closely aligned to the parameters to
 		// nd_copy_device_async than to nd_copy_layout, so we don't compute cudaMemcpy3DParms from `layout`.
 		cudaMemcpy3DParms parms = {};
@@ -81,7 +81,7 @@ void nd_copy_device_async(cudaStream_t stream, const void* const source_base, vo
 	        nd_copy_device_async(stream, source, dest, source_box, dest_box, copy_box, elem_size);
 	    },
 	    [stream](const void* const source, void* const dest, size_t size_bytes) {
-		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy", ForestGreen, "cudaMemcpyAsync");
+		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::memcpy", cuda_memcpy, "cudaMemcpyAsync");
 		    CELERITY_CUDA_CHECK(cudaMemcpyAsync, dest, source, size_bytes, cudaMemcpyDefault, stream);
 	    });
 }
@@ -101,7 +101,7 @@ struct cuda_native_event_deleter {
 
 using unique_cuda_native_event = std::unique_ptr<std::remove_pointer_t<cudaEvent_t>, cuda_native_event_deleter>;
 
 unique_cuda_native_event record_native_event(const cudaStream_t stream, bool enable_profiling) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::record_event", ForestGreen, "cudaEventRecord")
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("cuda::record_event", cuda_record_event, "cudaEventRecord")
 	cudaEvent_t event;
 	CELERITY_CUDA_CHECK(cudaEventCreateWithFlags, &event, enable_profiling ? cudaEventDefault : cudaEventDisableTiming);
 	CELERITY_CUDA_CHECK(cudaEventRecord, event, stream);
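The three `cuda_memcpy_*` zones map directly onto the copy-primitive selection: after dimensionality reduction, the number of remaining non-collapsible strides decides between `cudaMemcpyAsync`, `cudaMemcpy2DAsync`, and `cudaMemcpy3DAsync`. A rough sketch of that dispatch with a simplified layout struct (the two field names follow the diff; everything else is assumed):

```cpp
#include <cstddef>
#include <cstdio>

// Simplified stand-in for the layout computed by layout_nd_copy() in the real backend.
struct nd_copy_layout {
	size_t contiguous_size = 0;  // bytes in each contiguous run
	int num_complex_strides = 0; // strides that could not be collapsed: 0, 1, or 2
};

void dispatch_copy(const nd_copy_layout& layout) {
	if(layout.contiguous_size == 0) return; // empty copy: nothing to enqueue
	if(layout.num_complex_strides == 0) {
		std::puts("cudaMemcpyAsync");   // fully contiguous after reduction
	} else if(layout.num_complex_strides == 1) {
		std::puts("cudaMemcpy2DAsync"); // one stride: a pitched 2D copy suffices
	} else {
		std::puts("cudaMemcpy3DAsync"); // two strides: full 3D copy parameters
	}
}

int main() { dispatch_copy({4096, 1}); } // prints "cudaMemcpy2DAsync"
```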
diff --git a/src/backend/sycl_generic_backend.cc b/src/backend/sycl_generic_backend.cc
index 56a205aa..de0bb09f 100644
--- a/src/backend/sycl_generic_backend.cc
+++ b/src/backend/sycl_generic_backend.cc
@@ -27,7 +27,7 @@ void nd_copy_device_chunked(sycl::queue& queue, const void* const source_base, void* const dest_base,
 	const auto layout = layout_nd_copy(source_box.get_range(), dest_box.get_range(), copy_box.get_offset() - source_box.get_offset(),
 	    copy_box.get_offset() - dest_box.get_offset(), copy_box.get_range(), elem_size);
 	for_each_contiguous_chunk(layout, [&](const size_t chunk_offset_in_source, const size_t chunk_offset_in_dest, const size_t chunk_size) {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
 		// first, last: We remember the first and last submission event to report completion time spanning the entire region copy
 		last = queue.memcpy(
 		    static_cast<char*>(dest_base) + chunk_offset_in_dest, static_cast<const char*>(source_base) + chunk_offset_in_source, chunk_size);
@@ -47,7 +47,7 @@ async_event nd_copy_device_generic(sycl::queue& queue, const void* const source_
 	    [&queue, elem_size, enable_profiling, &first, &last](const void* const source, void* const dest, const box<3>& source_box, const box<3>& dest_box,
 	        const box<3>& copy_box) { nd_copy_device_chunked(queue, source, dest, source_box, dest_box, copy_box, elem_size, enable_profiling, first, last); },
 	    [&queue, enable_profiling, &first, &last](const void* const source, void* const dest, size_t size_bytes) {
-		    CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", Orange2);
+		    CELERITY_DETAIL_TRACY_ZONE_SCOPED("sycl::submit", sycl_submit);
 		    last = queue.memcpy(dest, source, size_bytes);
 		    if(enable_profiling) { first = last; }
 	    });
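The `first`/`last` comment in `nd_copy_device_chunked` explains how the generic backend profiles a strided copy without a native 2D/3D primitive: each contiguous chunk becomes its own `queue.memcpy` submission, and only the first and last events are retained so the reported completion time spans the whole region. A stripped-down sketch of that bookkeeping (mock event type, not the SYCL API):

```cpp
#include <cstddef>
#include <optional>
#include <vector>

struct event { int seq; }; // stand-in for sycl::event

// Submit one copy per contiguous chunk; remember the first and last submission.
event copy_chunks(const std::vector<size_t>& chunks, const bool enable_profiling, std::optional<event>& first) {
	event last{-1};
	int seq = 0;
	for(const size_t bytes : chunks) {
		(void)bytes;         // queue.memcpy(dest + ..., source + ..., bytes) in the real code
		last = event{seq++};
		if(enable_profiling && !first.has_value()) { first = last; } // keep the first submission
	}
	return last; // profiling interval = [submission of first, completion of last]
}

int main() {
	std::optional<event> first;
	const auto last = copy_chunks({64, 64, 64}, true, first);
	return (last.seq == 2 && first->seq == 0) ? 0 : 1;
}
```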
diff --git a/src/instruction_graph_generator.cc b/src/instruction_graph_generator.cc
index 36a45683..3aa669b1 100644
--- a/src/instruction_graph_generator.cc
+++ b/src/instruction_graph_generator.cc
@@ -992,7 +992,7 @@ void generator_impl::free_all_staging_allocations(batch& current_batch) {
 }
 
 void generator_impl::allocate_contiguously(batch& current_batch, const buffer_id bid, const memory_id mid, box_vector<3>&& required_contiguous_boxes) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::allocate", Teal);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::allocate", iggen_allocate);
 	if(required_contiguous_boxes.empty()) return;
 
@@ -1208,7 +1208,7 @@ bool should_linearize_copy_region(const memory_id alloc_mid, const box<3>& alloc
 void generator_impl::establish_coherence_between_buffer_memories(
     batch& current_batch, const buffer_id bid, dense_map<memory_id, std::vector<region<3>>>& concurrent_reads_from_memory) //
 {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::coherence", Red2);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::coherence", iggen_coherence);
 	auto& buffer = m_buffers.at(bid);
 
@@ -1494,7 +1494,7 @@ void generator_impl::create_task_collective_groups(batch& command_batch, const t
 }
 
 std::vector<localized_chunk> generator_impl::split_task_execution_range(const execution_command& ecmd, const task& tsk) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::split_task", Maroon);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::split_task", iggen_split_task);
 	if(tsk.get_execution_target() == execution_target::device && m_system.devices.empty()) { utils::panic("no device on which to execute device kernel"); }
 
@@ -1569,7 +1569,7 @@ void generator_impl::report_task_overlapping_writes(const task& tsk, const std::
 void generator_impl::satisfy_task_buffer_requirements(batch& current_batch, const buffer_id bid, const task& tsk, const subrange<3>& local_execution_range,
     const bool local_node_is_reduction_initializer, const std::vector<localized_chunk>& concurrent_chunks_after_split) //
 {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::satisfy_buffer_requirements", ForestGreen);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::satisfy_buffer_requirements", iggen_satisfy_buffer_requirements);
 	assert(!concurrent_chunks_after_split.empty());
 
@@ -1796,7 +1796,7 @@ void generator_impl::finish_task_local_reduction(batch& command_batch, const loc
 }
 
 instruction* generator_impl::launch_task_kernel(batch& command_batch, const execution_command& ecmd, const task& tsk, const localized_chunk& chunk) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::launch_kernel", Blue2);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::launch_kernel", iggen_launch_kernel);
 	const auto& bam = tsk.get_buffer_access_map();
 
@@ -1866,7 +1866,7 @@ instruction* generator_impl::launch_task_kernel(batch& command_batch, const exec
 void generator_impl::perform_task_buffer_accesses(
     const task& tsk, const std::vector<localized_chunk>& concurrent_chunks, const std::vector<instruction*>& command_instructions) //
 {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::perform_buffer_access", Red3);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::perform_buffer_access", iggen_perform_buffer_access);
 	const auto& bam = tsk.get_buffer_access_map();
 	if(bam.get_num_accesses() == 0 && tsk.get_reductions().empty()) return;
 
@@ -2320,7 +2320,7 @@ void generator_impl::flush_batch(batch&& batch) { // NOLINT(cppcoreguidelines-rv
 }
 
 instruction_graph_generator::scheduling_hint generator_impl::anticipate(const command& cmd) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::anticipate", Teal);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("iggen::anticipate", iggen_anticipate);
 	std::unordered_map<std::pair<buffer_id, memory_id>, box_vector<3>, utils::pair_hash> required_contiguous_boxes;
 	const auto require_contiguous = [&](const buffer_id bid, const memory_id mid, const box_vector<3>& boxes) {

diff --git a/src/live_executor.cc b/src/live_executor.cc
index b31b00ec..c277f049 100644
--- a/src/live_executor.cc
+++ b/src/live_executor.cc
@@ -415,7 +415,7 @@ void executor_impl::run() {
 		if(engine.is_idle()) {
 			if(!expecting_more_submissions) break; // shutdown complete
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::starve", DarkSlateGray);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::starve", executor_starve);
 			submission_queue->wait_while_empty(); // we are stalled on the scheduler, suspend thread
 			last_progress_timestamp.reset();      // do not treat suspension as being stuck
 		}
@@ -458,7 +458,7 @@ void executor_impl::poll_in_flight_async_instructions() {
 
 void executor_impl::poll_submission_queue() {
 	for(auto& submission : submission_queue->pop_all()) {
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::fetch", Gray);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::fetch", executor_fetch);
 		matchbox::match(
 		    submission,
 		    [&](const instruction_pilot_batch& batch) {
@@ -488,7 +488,7 @@ void executor_impl::poll_submission_queue() {
 }
 
 void executor_impl::retire_async_instruction(const instruction_id iid, async_instruction_state& async) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::retire", Brown);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::retire", executor_retire);
 
 #if CELERITY_ACCESSOR_BOUNDARY_CHECK
 	if(async.oob_info != nullptr) {
@@ -590,7 +590,7 @@ void executor_impl::try_issue_one_instruction() {
 	CELERITY_DETAIL_IF_TRACY_ENABLED(tracy->assignment_queue_length_plot.update(engine.get_assignment_queue_length()));
 
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::issue", Blue);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::issue", executor_issue);
 	matchbox::match(*assignment->instruction, [&](const auto& instr) { dispatch(instr, *assignment); });
 	made_progress = true;
 }
@@ -745,7 +745,7 @@ void executor_impl::issue_async(const free_instruction& finstr, const out_of_ord
 }
 
 void executor_impl::issue_async(const copy_instruction& cinstr, const out_of_order_engine::assignment& assignment, async_instruction_state& async) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::issue_copy", Green4);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::issue_copy", executor_issue_copy);
 	assert(assignment.target == out_of_order_engine::target::host_queue || assignment.target == out_of_order_engine::target::device_queue);
 	assert((assignment.target == out_of_order_engine::target::device_queue) == assignment.device.has_value());
 
@@ -779,7 +779,7 @@ std::string format_access_log(const buffer_access_allocation_map& map) {
 }
 
 void executor_impl::issue_async(const device_kernel_instruction& dkinstr, const out_of_order_engine::assignment& assignment, async_instruction_state& async) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::issue_device_kernel", Yellow2);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::issue_device_kernel", executor_issue_device_kernel);
 	assert(assignment.target == out_of_order_engine::target::device_queue);
 	assert(assignment.device == dkinstr.get_device_id());
 
@@ -890,7 +890,7 @@ void executor_impl::collect(const instruction_garbage& garbage) {
 }
 
 std::vector<closure_hydrator::accessor_info> executor_impl::make_accessor_infos(const buffer_access_allocation_map& amap) const {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::make_accessor_info", Magenta3);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::make_accessor_info", executor_make_accessor_info);
 	std::vector<closure_hydrator::accessor_info> accessor_infos(amap.size());
 	for(size_t i = 0; i < amap.size(); ++i) {
@@ -906,7 +906,7 @@ std::unique_ptr<boundary_check_info> executor_impl::attach_boundary_check_info(s
 {
 	if(amap.empty()) return nullptr;
 
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::oob_init", Red);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("executor::oob_init", executor_oob_init);
 	auto oob_info = std::make_unique<boundary_check_info>(tt, tid, task_name);
 
 	oob_info->illegal_access_bounding_boxes = static_cast<oob_bounding_box*>(backend->debug_alloc(amap.size() * sizeof(oob_bounding_box)));

diff --git a/src/out_of_order_engine.cc b/src/out_of_order_engine.cc
index 1100b6b3..ae9f2089 100644
--- a/src/out_of_order_engine.cc
+++ b/src/out_of_order_engine.cc
@@ -229,7 +229,7 @@ void engine_impl::try_mark_for_assignment(incomplete_instruction_state& node) {
 }
 
 void engine_impl::submit(const instruction* const instr) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("out_of_order_engine::submit", Blue3);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("out_of_order_engine::submit", out_of_order_engine_submit);
 	const auto iid = instr->get_id();
 	auto [node_it, inserted] = incomplete_instructions.emplace(iid, incomplete_instruction_state(instr));
@@ -313,7 +313,7 @@ void engine_impl::submit(const instruction* const instr) {
 }
 
 void engine_impl::complete(const instruction_id iid) {
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("out_of_order_engine::complete", Blue3);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("out_of_order_engine::complete", out_of_order_engine_complete);
 	const auto node_it = incomplete_instructions.find(iid);
 	assert(node_it != incomplete_instructions.end());
@@ -403,7 +403,7 @@ incomplete_instruction_state* engine_impl::pop_assignable() {
 
 std::optional<out_of_order_engine::assignment> engine_impl::assign_one() {
 	if(assignment_queue.empty()) return std::nullopt; // Don't begin a Tracy zone if there is nothing to assign
-	CELERITY_DETAIL_TRACY_ZONE_SCOPED("out_of_order_engine::assign", Blue3);
+	CELERITY_DETAIL_TRACY_ZONE_SCOPED("out_of_order_engine::assign", out_of_order_engine_assign);
 	const auto node_ptr = pop_assignable();
 	if(node_ptr == nullptr) return std::nullopt;

diff --git a/src/runtime.cc b/src/runtime.cc
index 6f429156..d739b386 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -281,7 +281,7 @@ namespace detail {
 		m_cfg = std::make_unique<config>(argc, argv);
 		CELERITY_DETAIL_IF_TRACY_SUPPORTED(tracy_detail::g_tracy_mode = m_cfg->get_tracy_mode());
 
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("runtime::startup", DarkGray);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("runtime::startup", runtime_startup);
 
 		if(s_test_mode) {
 			assert(s_test_active && "initializing the runtime from a test without a runtime_fixture");
@@ -331,7 +331,7 @@ namespace detail {
 
 		std::vector<sycl::device> devices;
 		{
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("runtime::select_devices", PaleVioletRed);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("runtime::select_devices", runtime_select_devices);
 			devices = std::visit([&](const auto& value) { return select_devices(host_cfg, value, sycl::platform::get_platforms()); }, user_devices_or_selector);
 			assert(!devices.empty()); // postcondition of select_devices
 		}
@@ -421,7 +421,7 @@ namespace detail {
 
 		require_call_from_application_thread();
 
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED("runtime::shutdown", DimGray);
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED("runtime::shutdown", runtime_shutdown);
 
 		// Create and await the shutdown epoch
 		sync(epoch_action::shutdown);
@@ -677,7 +677,7 @@ namespace detail {
 
 	void runtime::mpi_initialize_once(int* argc, char*** argv) {
 #if CELERITY_ENABLE_MPI
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("mpi::init", LightSkyBlue, "MPI_Init");
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("mpi::init", mpi_init, "MPI_Init");
 		assert(!s_mpi_initialized);
 		int provided = -1;
 		MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
@@ -688,7 +688,7 @@ namespace detail {
 
 	void runtime::mpi_finalize_once() {
 #if CELERITY_ENABLE_MPI
-		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("mpi::finalize", LightSkyBlue, "MPI_Finalize");
+		CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("mpi::finalize", mpi_finalize, "MPI_Finalize");
 		assert(s_mpi_initialized && !s_mpi_finalized && (!s_test_mode || !has_instance()));
 		MPI_Finalize();
 #endif // CELERITY_ENABLE_MPI
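`mpi_initialize_once` wraps the usual init-once guard around `MPI_Init_thread`, requesting `MPI_THREAD_MULTIPLE` since runtime threads issue MPI calls concurrently. A minimal sketch of the same guard outside of Celerity; error handling is elided, the flags mirror `s_mpi_initialized`/`s_mpi_finalized`, and the `provided` check is an assumption rather than the runtime's actual handling:

```cpp
#include <cassert>
#include <mpi.h>

namespace {
bool g_mpi_initialized = false; // mirrors runtime::s_mpi_initialized
bool g_mpi_finalized = false;   // mirrors runtime::s_mpi_finalized
} // namespace

void mpi_initialize_once(int* argc, char*** argv) {
	assert(!g_mpi_initialized);
	int provided = -1;
	MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
	assert(provided == MPI_THREAD_MULTIPLE); // assumed: bail out if the library can't grant it
	g_mpi_initialized = true;
}

void mpi_finalize_once() {
	assert(g_mpi_initialized && !g_mpi_finalized);
	MPI_Finalize();
	g_mpi_finalized = true;
}

int main(int argc, char** argv) {
	mpi_initialize_once(&argc, &argv);
	mpi_finalize_once();
}
```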
create", e.bid); cggen.notify_buffer_created(e.bid, e.range, e.user_allocation_id != null_allocation_id); // Buffer creation must be applied immediately (and out-of-order when necessary) so that instruction_graph_generator::anticipate() does not operate // on unknown buffers. This is fine as buffer creation never has dependencies on other commands and we do not re-use buffer ids. @@ -234,21 +234,21 @@ void scheduler_impl::process_task_queue_event(const task_event& evt) { }, [&](const event_buffer_debug_name_changed& e) { assert(!shutdown_epoch_created && !shutdown_epoch_reached); - CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_name_changed", DarkGreen, "B{} set name", e.bid); + CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_name_changed", scheduler_buffer_name_changed, "B{} set name", e.bid); cggen.notify_buffer_debug_name_changed(e.bid, e.debug_name); // buffer-name changes are enqueued in-order to ensure that instruction records have the buffer names as they existed at task creation time. command_queue.push(e); }, [&](const event_buffer_destroyed& e) { assert(!shutdown_epoch_created && !shutdown_epoch_reached); - CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_destroyed", DarkGreen, "B{} destroy", e.bid); + CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_destroyed", scheduler_buffer_destroyed, "B{} destroy", e.bid); cggen.notify_buffer_destroyed(e.bid); // host-object destruction must happen in-order, otherwise iggen would need to compile commands on already-deleted buffers. command_queue.push(e); }, [&](const event_host_object_created& e) { assert(!shutdown_epoch_created && !shutdown_epoch_reached); - CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::host_object_created", DarkGreen, "H{} create", e.hoid); + CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::host_object_created", scheduler_host_object_created, "H{} create", e.hoid); cggen.notify_host_object_created(e.hoid); // instruction_graph_generator::anticipate() does not examine host objects (unlike it does with buffers), but it doesn't hurt to create them early // either since we don't re-use host object ids. @@ -256,13 +256,13 @@ void scheduler_impl::process_task_queue_event(const task_event& evt) { }, [&](const event_host_object_destroyed& e) { assert(!shutdown_epoch_created && !shutdown_epoch_reached); - CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::host_object_destroyed", DarkGreen, "H{} destroy", e.hoid); + CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::host_object_destroyed", scheduler_host_object_destroyed, "H{} destroy", e.hoid); cggen.notify_host_object_destroyed(e.hoid); // host-object destruction must happen in-order, otherwise iggen would need to compile commands on already-deleted host objects. 
 			command_queue.push(e);
 		},
 		[&](const event_epoch_reached& e) { //
-			CELERITY_DETAIL_TRACY_ZONE_SCOPED("scheduler::prune", Gray);
+			CELERITY_DETAIL_TRACY_ZONE_SCOPED("scheduler::prune", scheduler_prune);
 			cdag.erase_before_epoch(e.tid);
 			idag.erase_before_epoch(e.tid);
@@ -285,20 +285,20 @@ void scheduler_impl::process_command_queue_event(const command_event& evt) {
 	matchbox::match(
 	    evt, //
 	    [&](const event_command_available& e) {
-		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::compile_command", MidnightBlue, "C{} compile", e.cmd->get_id());
+		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::compile_command", scheduler_compile_command, "C{} compile", e.cmd->get_id());
 		    CELERITY_DETAIL_TRACY_ZONE_TEXT("{}", print_command_type(*e.cmd));
 		    iggen.compile(*e.cmd);
 	    },
 	    [&](const event_buffer_debug_name_changed& e) {
-		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_name_changed", DarkGreen, "B{} set name", e.bid);
+		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_name_changed", scheduler_buffer_name_changed, "B{} set name", e.bid);
 		    iggen.notify_buffer_debug_name_changed(e.bid, e.debug_name);
 	    },
 	    [&](const event_buffer_destroyed& e) {
-		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_destroyed", DarkGreen, "B{} destroy", e.bid);
+		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::buffer_destroyed", scheduler_buffer_destroyed, "B{} destroy", e.bid);
 		    iggen.notify_buffer_destroyed(e.bid);
 	    },
 	    [&](const event_host_object_destroyed& e) {
-		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::host_object_destroyed", DarkGreen, "H{} destroy", e.hoid);
+		    CELERITY_DETAIL_TRACY_ZONE_SCOPED_V("scheduler::host_object_destroyed", scheduler_host_object_destroyed, "H{} destroy", e.hoid);
 		    iggen.notify_host_object_destroyed(e.hoid);
 	    },
 	    [&](const event_set_lookahead& e) {