Memory profiler: add support for distconv-enabled layers (#2458)
* Memory profiler: add support for distconv-enabled layers

* Fix compilation without distconv

* Update src/callbacks/memory_profiler.cpp

Co-authored-by: Tom Benson <[email protected]>

* Address review comment

---------

Co-authored-by: Tom Benson <[email protected]>
tbennun and benson31 authored Jun 28, 2024
1 parent c920d33 commit fa569da
Showing 2 changed files with 72 additions and 15 deletions.
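
For context on the second commit-message bullet ("Fix compilation without distconv"), the change guards the distconv-specific reporting behind LBANN_HAS_DISTCONV and provides a stub overload so builds without distconv keep compiling. A minimal, self-contained sketch of that pattern follows; apart from the LBANN_HAS_DISTCONV macro, all names are illustrative, not LBANN's API.

// Sketch of the compile-time guard pattern used in this commit (illustrative names).
#include <cstddef>
#include <iostream>

#ifdef LBANN_HAS_DISTCONV
// Distconv build: report the distconv tensor (signature simplified here).
template <typename TensorT>
std::size_t report_tensor(TensorT const& t, std::ostream& os)
{
  std::size_t const bytes = t.get_local_real_size() * sizeof(float);
  os << "distconv tensor: " << bytes / 1048576.0 << " MiB\n";
  return bytes;
}
#else
// Fallback so builds without distconv still compile; callers are expected to
// check distconv_enabled() first, so this path does little at run time.
template <typename T>
std::size_t report_tensor(T const&, std::ostream& os)
{
  os << "Distconv is disabled\n";
  return 0;
}
#endif

int main()
{
#ifndef LBANN_HAS_DISTCONV
  int dummy = 0;
  std::cout << report_tensor(dummy, std::cout) << " bytes\n";
#endif
  return 0;
}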
6 changes: 6 additions & 0 deletions include/lbann/callbacks/memory_profiler.hpp
@@ -116,6 +116,12 @@ class memory_profiler : public callback_base
/** Unaccounted memory in bytes during backpropagation */
std::map<Layer*, size_t> m_unaccounted_bp_layer;

/** Activation sizes in bytes per layer */
std::map<Layer*, size_t> m_act_sizes;

/** Activation shape report per layer */
std::map<Layer*, std::string> m_act_report;

/** Current step, used for tracking memory usage. */
int m_current_step;

81 changes: 66 additions & 15 deletions src/callbacks/memory_profiler.cpp
@@ -36,6 +36,10 @@

#include "lbann/proto/callbacks.pb.h"

#ifdef LBANN_HAS_DISTCONV
#include "lbann/layers/distconv_adapter.hpp"
#endif

#include "h2/patterns/multimethods/SwitchDispatcher.hpp"

#include <algorithm>
@@ -82,19 +86,62 @@ size_t report_dist_matrix(El::AbstractDistMatrix<T> const& m,
return allocated;
}

#ifdef LBANN_HAS_DISTCONV
template <typename T>
size_t report_distconv_matrix(::lbann::dc::TensorDev<T> const& m,
std::ostream& stream)
{
size_t const allocated = m.get_local_real_size() * sizeof(T);
auto const& shp = m.get_shape();
auto const& lshp = m.get_local_real_shape();
stream << shp[0];
for (int i = 1; i < shp.num_dims(); ++i) {
stream << " x " << shp[i];
}
stream << " (local shape (with halo): " << lshp[0];
for (int i = 1; i < lshp.num_dims(); ++i) {
stream << " x " << lshp[i];
}
stream << "). Size: " << allocated / 1048576.0 << " MiB" << std::endl;
return allocated;
}
#else
template <typename T>
size_t report_distconv_matrix(T const& m, std::ostream& stream)
{
stream << "Distconv is disabled" << std::endl;
return 0;
}
#endif
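
// Illustrative sketch (not part of the commit): the size arithmetic performed by
// report_distconv_matrix above, assuming a float tensor whose hypothetical local
// "real" shape (i.e. including the halo regions exchanged between neighboring
// ranks) is 3 x 34 x 130 x 130.
#include <cstddef>
#include <iostream>

int main()
{
  std::size_t const local_real_size = 3ull * 34 * 130 * 130; // elements, halo included
  std::size_t const allocated = local_real_size * sizeof(float);
  std::cout << allocated / 1048576.0 << " MiB\n"; // ~6.6 MiB
  return 0;
}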

template <typename T>
size_t get_activation_and_error_signal_size(data_type_layer<T> const& dtl,
std::ostream& reps)
{
size_t allocated = 0;
for (int i = 0; i < dtl.get_num_children(); ++i) {
-    auto const& act = dtl.get_activations(i);
if (dtl.get_num_children() == 1)
reps << " Activations: ";
else
reps << " Activations (" << i << "): ";

-    allocated += report_dist_matrix(act, reps);
+    if (dtl.distconv_enabled()) {
+#ifdef LBANN_HAS_DISTCONV
+      auto const& child = dtl.get_child_layer(i);
+      auto const& dcact = dtl.get_distconv_adapter().get_activations(child);
+      allocated += report_distconv_matrix(dcact, reps);
+      // Add activations if child layer is not distconv-enabled
+      if (!child.distconv_enabled()) {
+        auto const& act = dtl.get_activations(i);
+        reps << " + non-distconv adapter: ";
+        allocated += report_dist_matrix(act, reps);
+      }
+#endif
+    }
+    else {
+      auto const& act = dtl.get_activations(i);
+      allocated += report_dist_matrix(act, reps);
+    }
}
return allocated;
}
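
// Illustrative sketch (not part of the commit): the per-child accounting rule that
// the loop above implements, restated with plain booleans and hypothetical byte
// counts. The distconv tensor is always counted when the layer runs with distconv;
// the LBANN matrix is additionally counted only when the consuming child is not
// distconv-enabled (per the "Add activations if child layer is not distconv-enabled"
// branch above).
#include <cstddef>
#include <iostream>

struct ChildAccounting {
  bool layer_distconv;        // this layer is distconv-enabled
  bool child_distconv;        // the child consuming output i is distconv-enabled
  std::size_t lbann_bytes;    // size of the El::AbstractDistMatrix activations
  std::size_t distconv_bytes; // size of the distconv tensor (with halo)
};

std::size_t bytes_counted(ChildAccounting const& c)
{
  if (!c.layer_distconv)
    return c.lbann_bytes;               // plain LBANN matrix only
  std::size_t total = c.distconv_bytes; // distconv tensor is always counted
  if (!c.child_distconv)
    total += c.lbann_bytes;             // plus the non-distconv copy for this child
  return total;
}

int main()
{
  // Distconv layer feeding a non-distconv child: both allocations are counted.
  std::cout << bytes_counted({true, false, 4 << 20, 5 << 20}) << " bytes\n";
  return 0;
}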
@@ -290,8 +337,8 @@ void memory_profiler::report_mem_usage(model* m)

// Get maximal activation/error signal size (suboptimal approximation)
{
-      size_t const allocated =
-        get_activation_and_error_signal_size(*layer, reps);
+      size_t const allocated = m_act_sizes[layer];
+      reps << m_act_report[layer];
layer_total_acts += allocated;
layer_total += allocated;
}
@@ -426,13 +473,6 @@ void memory_profiler::on_setup_end(model* m)
auto comm = m->get_comm();
bool should_print = comm->am_trainer_master();

-  // Post-setup printout of layer accounting
-  if (should_print) {
-    std::cout << "MEM: Expected memory usage by layer (in descending order):"
-              << std::endl;
-    report_mem_usage(m);
-  }

// Print total used memory
m_step0_usage = m_setup_end_usage = get_used_gpu_memory();
if (m_setup_end_usage > m_initial_memory_usage && should_print) {
@@ -531,11 +571,18 @@ void memory_profiler::on_batch_end(model* m)
break;
}

-  // Check for and print leak report
-  if (m_current_step == 2) {
-    auto comm = m->get_comm();
-    bool should_print = comm->am_trainer_master();
+  auto comm = m->get_comm();
+  bool should_print = comm->am_trainer_master();
+
+  // Print collected activation and weight size
+  if (should_print && m_current_step == 0) {
+    std::cout << "MEM: Memory usage by layer (in descending order):"
+              << std::endl;
+    report_mem_usage(m);
+  }
+
+  // Check for and print leak report
+  if (should_print && m_current_step == 2) {
double third_step = m_step2_usage > m_step1_usage
? (m_step2_usage - m_step1_usage) / 1048576.0
: 0.0;
@@ -646,6 +693,10 @@ void memory_profiler::on_forward_prop_end(model* m, Layer* l)
m_unaccounted_fp_layer[l] = 0;
}
collect_peak_usage();

std::ostringstream ss;
m_act_sizes[l] = get_activation_and_error_signal_size(*l, ss);
m_act_report[l] = ss.str();
}
}
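
// Illustrative sketch (not part of the commit): the compute-once caching idea used
// above. The activation report is generated during forward prop and stored per
// layer, so report_mem_usage() can print it later without touching the tensors
// again. Types and names below are hypothetical stand-ins for the callback's maps.
#include <cstddef>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

struct Layer; // opaque key, as in the callback's std::map<Layer*, ...> members

struct ActivationCache {
  std::map<Layer*, std::size_t> sizes;   // bytes per layer
  std::map<Layer*, std::string> reports; // formatted shape/size text per layer

  void record(Layer* l, std::size_t bytes, std::string report)
  {
    sizes[l] = bytes;
    reports[l] = std::move(report);
  }

  void print(Layer* l, std::ostream& os) const
  {
    auto it = reports.find(l);
    os << (it != reports.end() ? it->second : "(not recorded yet)\n");
  }
};

int main()
{
  ActivationCache cache;
  Layer* key = nullptr; // a real callback would use the layer pointer itself
  std::ostringstream ss;
  ss << "  Activations: 64 x 3 x 32 x 32. Size: 0.75 MiB\n"; // hypothetical report
  cache.record(key, 786432, ss.str());
  cache.print(key, std::cout);
  return 0;
}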
void memory_profiler::on_backward_prop_begin(model* m, Layer* l)