Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Memory profiler: add support for distconv-enabled layers #2458

Merged
merged 4 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/lbann/callbacks/memory_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ class memory_profiler : public callback_base
/** Unaccounted memory in bytes during backpropagation */
std::map<Layer*, size_t> m_unaccounted_bp_layer;

/** Activation sizes in bytes per layer */
std::map<Layer*, size_t> m_act_sizes;

/** Activation shape report per layer */
std::map<Layer*, std::string> m_act_report;

/** Current step, used for tracking memory usage. */
int m_current_step;

Expand Down
75 changes: 60 additions & 15 deletions src/callbacks/memory_profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@

#include "lbann/proto/callbacks.pb.h"

#ifdef LBANN_HAS_DISTCONV
#include "lbann/layers/distconv_adapter.hpp"
#endif

#include "h2/patterns/multimethods/SwitchDispatcher.hpp"

#include <algorithm>
Expand Down Expand Up @@ -82,19 +86,56 @@ size_t report_dist_matrix(El::AbstractDistMatrix<T> const& m,
return allocated;
}

#ifdef LBANN_HAS_DISTCONV
/** Writes the global and local (halo-inclusive) shape of a DistConv
 *  tensor to @p stream and returns its locally allocated size in bytes. */
template <typename T>
size_t report_distconv_matrix(::lbann::dc::TensorDev<T> const& m,
                              std::ostream& stream)
{
  // Helper that prints dimensions as "d0 x d1 x ... x dk".
  auto const print_dims = [&stream](auto const& dims) {
    stream << dims[0];
    for (int d = 1; d < dims.num_dims(); ++d)
      stream << " x " << dims[d];
  };

  size_t const allocated = m.get_local_real_size() * sizeof(T);
  print_dims(m.get_shape());
  stream << " (local shape (with halo): ";
  print_dims(m.get_local_real_shape());
  stream << "). Size: " << allocated / 1048576.0 << " MiB" << std::endl;
  return allocated;
}
#else
/** Fallback used when LBANN is built without DistConv support: emits a
 *  notice to @p stream and contributes zero bytes. */
template <typename T>
size_t report_distconv_matrix(T const& m, std::ostream& stream)
{
  stream << "Distconv is disabled" << std::endl;
  return 0;
}
#endif

/** @brief Accumulates the size in bytes of a layer's activation tensors,
 *  writing a human-readable per-output report to @p reps.
 *
 *  For each child connection, reports either the DistConv tensor (when the
 *  layer is distconv-enabled and LBANN is built with DistConv) or the
 *  regular distributed matrix.
 *
 *  @param dtl  Layer whose activations are measured.
 *  @param reps Stream receiving one "Activations..." line per output.
 *  @return Total bytes allocated for this layer's activations.
 *
 *  NOTE(review): the diff as rendered had stale pre-change statements and
 *  review-UI text fused into this body (duplicate get_activations /
 *  report_dist_matrix calls that would double-count the non-distconv path);
 *  this is the intended post-change body with those artifacts removed.
 */
template <typename T>
size_t get_activation_and_error_signal_size(data_type_layer<T> const& dtl,
                                            std::ostream& reps)
{
  size_t allocated = 0;
  for (int i = 0; i < dtl.get_num_children(); ++i) {
    // Only number the outputs when there is more than one.
    if (dtl.get_num_children() == 1)
      reps << "  Activations: ";
    else
      reps << "  Activations (" << i << "): ";

    if (dtl.distconv_enabled()) {
#ifdef LBANN_HAS_DISTCONV
      // DistConv-enabled layers keep their activations in DistConv
      // tensors keyed by the consuming child layer.
      auto const& child = dtl.get_child_layer(i);
      auto const& dcact = dtl.get_distconv_adapter().get_activations(child);
      allocated += report_distconv_matrix(dcact, reps);
#endif
    }
    else {
      auto const& act = dtl.get_activations(i);
      allocated += report_dist_matrix(act, reps);
    }
  }
  return allocated;
}
Expand Down Expand Up @@ -290,8 +331,8 @@ void memory_profiler::report_mem_usage(model* m)

// Get maximal activation/error signal size (suboptimal approximation)
{
size_t const allocated =
get_activation_and_error_signal_size(*layer, reps);
size_t const allocated = m_act_sizes[layer];
reps << m_act_report[layer];
layer_total_acts += allocated;
layer_total += allocated;
}
Expand Down Expand Up @@ -426,13 +467,6 @@ void memory_profiler::on_setup_end(model* m)
auto comm = m->get_comm();
bool should_print = comm->am_trainer_master();

// Post-setup printout of layer accounting
if (should_print) {
std::cout << "MEM: Expected memory usage by layer (in descending order):"
<< std::endl;
report_mem_usage(m);
}

// Print total used memory
m_step0_usage = m_setup_end_usage = get_used_gpu_memory();
if (m_setup_end_usage > m_initial_memory_usage && should_print) {
Expand Down Expand Up @@ -531,11 +565,18 @@ void memory_profiler::on_batch_end(model* m)
break;
}

// Check for and print leak report
if (m_current_step == 2) {
auto comm = m->get_comm();
bool should_print = comm->am_trainer_master();
auto comm = m->get_comm();
bool should_print = comm->am_trainer_master();

// Print collected activation and weight size
if (should_print && m_current_step == 0) {
std::cout << "MEM: Memory usage by layer (in descending order):"
<< std::endl;
report_mem_usage(m);
}

// Check for and print leak report
if (should_print && m_current_step == 2) {
double third_step = m_step2_usage > m_step1_usage
? (m_step2_usage - m_step1_usage) / 1048576.0
: 0.0;
Expand Down Expand Up @@ -646,6 +687,10 @@ void memory_profiler::on_forward_prop_end(model* m, Layer* l)
m_unaccounted_fp_layer[l] = 0;
}
collect_peak_usage();

std::stringstream ss;
tbennun marked this conversation as resolved.
Show resolved Hide resolved
m_act_sizes[l] = get_activation_and_error_signal_size(*l, ss);
m_act_report[l] = ss.str();
}
}
void memory_profiler::on_backward_prop_begin(model* m, Layer* l)
Expand Down
Loading