From d34e698fe1a7e8c923d75392fe38fd4af6a435a7 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Thu, 20 Jun 2024 17:23:32 -0700 Subject: [PATCH 1/4] Memory profiler: add support for distconv-enabled layers --- include/lbann/callbacks/memory_profiler.hpp | 6 ++ src/callbacks/memory_profiler.cpp | 73 ++++++++++++++++----- 2 files changed, 64 insertions(+), 15 deletions(-) diff --git a/include/lbann/callbacks/memory_profiler.hpp b/include/lbann/callbacks/memory_profiler.hpp index dc97e77e0ab..e79a0ecea82 100644 --- a/include/lbann/callbacks/memory_profiler.hpp +++ b/include/lbann/callbacks/memory_profiler.hpp @@ -116,6 +116,12 @@ class memory_profiler : public callback_base /** Unaccounted memory in bytes during backpropagation */ std::map m_unaccounted_bp_layer; + /** Activation sizes in bytes per layer */ + std::map m_act_sizes; + + /** Activation shape report per layer */ + std::map m_act_report; + /** Current step, used for tracking memory usage. */ int m_current_step; diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index 8d770a8ab8a..e60fcb64f61 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -36,6 +36,10 @@ #include "lbann/proto/callbacks.pb.h" +#ifdef LBANN_HAS_DISTCONV +#include "lbann/layers/distconv_adapter.hpp" +#endif + #include "h2/patterns/multimethods/SwitchDispatcher.hpp" #include @@ -82,19 +86,54 @@ size_t report_dist_matrix(El::AbstractDistMatrix const& m, return allocated; } +#ifdef LBANN_HAS_DISTCONV +template +size_t report_distconv_matrix(::lbann::dc::TensorDev const& m, + std::ostream& stream) +{ + size_t const allocated = m.get_local_real_size() * sizeof(T); + auto const& shp = m.get_shape(); + auto const& lshp = m.get_local_real_shape(); + stream << shp[0]; + for (int i = 1; i < shp.num_dims(); ++i) { + stream << " x " << shp[i]; + } + stream << " (local shape (with halo): " << lshp[0]; + for (int i = 1; i < lshp.num_dims(); ++i) { + stream << " x " << lshp[i]; + } + stream << "). Size: " << allocated / 1048576.0 << " MiB" << std::endl; + return allocated; +} +#else +template +size_t report_distconv_matrix(T const& m, std::ostream& stream) +{ + stream << "Distconv is disabled" << std::endl; + return 0; +} +#endif + template size_t get_activation_and_error_signal_size(data_type_layer const& dtl, std::ostream& reps) { size_t allocated = 0; for (int i = 0; i < dtl.get_num_children(); ++i) { - auto const& act = dtl.get_activations(i); if (dtl.get_num_children() == 1) reps << " Activations: "; else reps << " Activations (" << i << "): "; - allocated += report_dist_matrix(act, reps); + if (dtl.distconv_enabled()) { + auto const& child = dtl.get_child_layer(i); + auto const& dcact = dtl.get_distconv_adapter().get_activations(child); + allocated += report_distconv_matrix(dcact, reps); + } + else { + auto const& act = dtl.get_activations(i); + allocated += report_dist_matrix(act, reps); + } } return allocated; } @@ -290,8 +329,8 @@ void memory_profiler::report_mem_usage(model* m) // Get maximal activation/error signal size (suboptimal approximation) { - size_t const allocated = - get_activation_and_error_signal_size(*layer, reps); + size_t const allocated = m_act_sizes[layer]; + reps << m_act_report[layer]; layer_total_acts += allocated; layer_total += allocated; } @@ -426,13 +465,6 @@ void memory_profiler::on_setup_end(model* m) auto comm = m->get_comm(); bool should_print = comm->am_trainer_master(); - // Post-setup printout of layer accounting - if (should_print) { - std::cout << "MEM: Expected memory usage by layer (in descending order):" - << std::endl; - report_mem_usage(m); - } - // Print total used memory m_step0_usage = m_setup_end_usage = get_used_gpu_memory(); if (m_setup_end_usage > m_initial_memory_usage && should_print) { @@ -531,11 +563,18 @@ void memory_profiler::on_batch_end(model* m) break; } - // Check for and print leak report - if (m_current_step == 2) { - auto comm = m->get_comm(); - bool should_print = comm->am_trainer_master(); + auto comm = m->get_comm(); + bool should_print = comm->am_trainer_master(); + + // Print collected activation and weight size + if (should_print && m_current_step == 0) { + std::cout << "MEM: Memory usage by layer (in descending order):" + << std::endl; + report_mem_usage(m); + } + // Check for and print leak report + if (should_print && m_current_step == 2) { double third_step = m_step2_usage > m_step1_usage ? (m_step2_usage - m_step1_usage) / 1048576.0 : 0.0; @@ -646,6 +685,10 @@ void memory_profiler::on_forward_prop_end(model* m, Layer* l) m_unaccounted_fp_layer[l] = 0; } collect_peak_usage(); + + std::stringstream ss; + m_act_sizes[l] = get_activation_and_error_signal_size(*l, ss); + m_act_report[l] = ss.str(); } } void memory_profiler::on_backward_prop_begin(model* m, Layer* l) From 6202289b3a7944fcfdb1af0e19b0d227ecf42ebb Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 21 Jun 2024 10:36:14 -0700 Subject: [PATCH 2/4] Fix compilation without distconv --- src/callbacks/memory_profiler.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index e60fcb64f61..0fb27c281de 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -126,9 +126,11 @@ size_t get_activation_and_error_signal_size(data_type_layer const& dtl, reps << " Activations (" << i << "): "; if (dtl.distconv_enabled()) { +#ifdef LBANN_HAS_DISTCONV auto const& child = dtl.get_child_layer(i); auto const& dcact = dtl.get_distconv_adapter().get_activations(child); allocated += report_distconv_matrix(dcact, reps); +#endif } else { auto const& act = dtl.get_activations(i); From 5b52cbf7e3e92fa2bcf6b388b711dee718cbdf52 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Mon, 24 Jun 2024 14:42:27 -0700 Subject: [PATCH 3/4] Update src/callbacks/memory_profiler.cpp Co-authored-by: Tom Benson --- src/callbacks/memory_profiler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index 0fb27c281de..b7a8b1f0a69 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -688,7 +688,7 @@ void memory_profiler::on_forward_prop_end(model* m, Layer* l) } collect_peak_usage(); - std::stringstream ss; + std::ostringstream ss; m_act_sizes[l] = get_activation_and_error_signal_size(*l, ss); m_act_report[l] = ss.str(); } From fd30e27bb3f3b4e77921942de7a21a7da929d876 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Fri, 28 Jun 2024 09:05:37 -0700 Subject: [PATCH 4/4] Address review comment --- src/callbacks/memory_profiler.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index b7a8b1f0a69..bf2a27f5abe 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -130,6 +130,12 @@ size_t get_activation_and_error_signal_size(data_type_layer const& dtl, auto const& child = dtl.get_child_layer(i); auto const& dcact = dtl.get_distconv_adapter().get_activations(child); allocated += report_distconv_matrix(dcact, reps); + // Add activations if child layer is not distconv-enabled + if (!child.distconv_enabled()) { + auto const& act = dtl.get_activations(i); + reps << " + non-distconv adapter: "; + allocated += report_dist_matrix(act, reps); + } #endif } else {